# video.py
  1. from functools import lru_cache
  2. from typing import List, Union
  3. import numpy as np
  4. from loguru import logger
  5. from aphrodite.common.config import ModelConfig
  6. from aphrodite.common.utils import is_list_of
  7. from aphrodite.inputs.registry import InputContext
  8. from aphrodite.transformers_utils.processor import get_video_processor
  9. from aphrodite.transformers_utils.tokenizer import get_tokenizer
  10. from .base import MultiModalData, MultiModalInputs
  11. from .image import ImagePlugin
  12. cached_get_video_processor = lru_cache(get_video_processor)
  13. cached_get_tokenizer = lru_cache(get_tokenizer)
  14. VideoInput = Union[
  15. "np.ndarray", # single video input
  16. List["np.ndarray"],
  17. # TODO: support more types
  18. # List[Image.Image], List[List[Image.Image]],
  19. # "torch.Tensor",
  20. # List["torch.Tensor"],
  21. # List[List["np.ndarrray"]],
  22. # List[List["torch.Tensor"]],
  23. ]
  24. class VideoPlugin(ImagePlugin):
  25. """Plugin for video data."""
  26. def get_data_key(self) -> str:
  27. return "video"
  28. def _get_hf_video_processor(self, model_config: ModelConfig):
  29. mm_processor_kwargs = ({} if model_config.mm_processor_kwargs is None
  30. else model_config.mm_processor_kwargs)
  31. # We don't explicitly check kwarg overrides to the HF class
  32. # since the automodel just takes kwargs, so we can't inspect it
  33. return cached_get_video_processor(
  34. model_config.model,
  35. trust_remote_code=model_config.trust_remote_code,
  36. **mm_processor_kwargs)
  37. def _default_input_mapper(
  38. self,
  39. ctx: InputContext,
  40. data: MultiModalData[object],
  41. ) -> MultiModalInputs:
  42. model_config = ctx.model_config
  43. # single video input as np.ndarray
  44. if isinstance(data, np.ndarray):
  45. video_processor = self._get_hf_video_processor(model_config)
  46. if video_processor is None:
  47. raise RuntimeError(
  48. "No HuggingFace processor is available "
  49. "to process the image object"
  50. )
  51. try:
  52. batch_data = video_processor(data, return_tensors="pt").data
  53. except Exception:
  54. logger.error(f"Failed to process image ({data})")
  55. raise
  56. return MultiModalInputs(batch_data)
  57. elif is_list_of(data, np.ndarray):
  58. raise NotImplementedError(
  59. "Multi video for a prompt is not supported yet"
  60. )
  61. raise TypeError(f"Invalid video type: {type(data)}")
  62. def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
  63. return 4096