test_llava_next_video.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. from typing import List, Optional, Tuple, Type, overload
  2. import pytest
  3. import transformers
  4. from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
  5. from aphrodite.common.sequence import SampleLogprobs
  6. from aphrodite.multimodal.utils import (rescale_video_size, resize_video,
  7. sample_frames_from_video)
  8. from ..conftest import VIDEO_ASSETS, AphroditeRunner, HfRunner, _VideoAssets
  9. from .utils import check_logprobs_close
# Apply the `vlm` pytest marker to every test collected from this module.
pytestmark = pytest.mark.vlm

# System preamble that opens the chat prompt. It has no trailing space, so it
# is joined directly to "USER:" when interpolated into the prompt below.
_PREFACE = (
    "A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's "
    "questions."
)

# Prompts keyed by video asset name. NOTE(review): `run_test` zips this with
# the sampled videos, so `VIDEO_ASSETS.prompts` presumably returns the prompts
# in the same order as the video assets - confirm against conftest.
HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
    {
        "sample_demo_1": f"{_PREFACE}USER: <video>\nWhy is this video funny? "
        "ASSISTANT:"
    }
)

# Model identifiers the tests in this module are parametrized over.
models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
  23. def aphrodite_to_hf_output(
  24. aphrodite_output: Tuple[List[int],
  25. str, Optional[SampleLogprobs]], model: str
  26. ):
  27. """Sanitize aphrodite output to be comparable with hf output."""
  28. output_ids, output_str, out_logprobs = aphrodite_output
  29. config = AutoConfig.from_pretrained(model)
  30. video_token_id = config.video_token_index
  31. tokenizer = AutoTokenizer.from_pretrained(model)
  32. eos_token_id = tokenizer.eos_token_id
  33. hf_output_ids = [
  34. token_id
  35. for idx, token_id in enumerate(output_ids)
  36. if token_id != video_token_id or output_ids[idx - 1] != video_token_id
  37. ]
  38. assert output_str[0] == " "
  39. hf_output_str = output_str[1:]
  40. if hf_output_ids[-1] == eos_token_id:
  41. hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
  42. return hf_output_ids, hf_output_str, out_logprobs
@overload
def run_test(
    hf_runner: Type[HfRunner],
    aphrodite_runner: Type[AphroditeRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Typing-only signature: videos are rescaled by relative `size_factors`."""
    ...
@overload
def run_test(
    hf_runner: Type[HfRunner],
    aphrodite_runner: Type[AphroditeRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    sizes: List[Tuple[int, int]],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Typing-only signature: videos are resized to each entry of `sizes`."""
    ...
  75. def run_test(
  76. hf_runner: Type[HfRunner],
  77. aphrodite_runner: Type[AphroditeRunner],
  78. video_assets: _VideoAssets,
  79. model: str,
  80. *,
  81. size_factors: Optional[List[float]] = None,
  82. sizes: Optional[List[Tuple[int, int]]] = None,
  83. dtype: str,
  84. max_tokens: int,
  85. num_logprobs: int,
  86. num_frames: int,
  87. tensor_parallel_size: int,
  88. distributed_executor_backend: Optional[str] = None,
  89. ):
  90. videos = [
  91. sample_frames_from_video(asset.np_ndarrays, num_frames)
  92. for asset in video_assets
  93. ]
  94. for video in videos:
  95. print(video.shape)
  96. if size_factors is not None:
  97. inputs_per_video = [
  98. (
  99. [prompt for _ in size_factors],
  100. [rescale_video_size(video, factor) for factor in size_factors],
  101. )
  102. for video, prompt in zip(videos, HF_VIDEO_PROMPTS)
  103. ]
  104. elif sizes is not None:
  105. inputs_per_video = [
  106. (
  107. [prompt for _ in sizes],
  108. [resize_video(video, size) for size in sizes],
  109. )
  110. for video, prompt in zip(videos, HF_VIDEO_PROMPTS)
  111. ]
  112. else:
  113. raise ValueError("You must provide either `size_factors` or `sizes`")
  114. # max_model_len should be greater than image_feature_size
  115. with aphrodite_runner(
  116. model,
  117. dtype=dtype,
  118. max_model_len=4096,
  119. tensor_parallel_size=tensor_parallel_size,
  120. distributed_executor_backend=distributed_executor_backend,
  121. enforce_eager=True,
  122. ) as aphrodite_model:
  123. aphrodite_outputs_per_video = [
  124. aphrodite_model.generate_greedy_logprobs(
  125. prompts, max_tokens, num_logprobs=num_logprobs, videos=videos
  126. )
  127. for prompts, videos in inputs_per_video
  128. ]
  129. with hf_runner(
  130. model, dtype=dtype, auto_cls=AutoModelForVision2Seq
  131. ) as hf_model:
  132. hf_outputs_per_video = [
  133. hf_model.generate_greedy_logprobs_limit(
  134. prompts, max_tokens, num_logprobs=num_logprobs, videos=videos
  135. )
  136. for prompts, videos in inputs_per_video
  137. ]
  138. for hf_outputs, aphrodite_outputs in zip(
  139. hf_outputs_per_video, aphrodite_outputs_per_video
  140. ):
  141. # TODO: Check whether using original CLIPVisionModel can improve
  142. # consistency against HF
  143. check_logprobs_close(
  144. outputs_0_lst=hf_outputs,
  145. outputs_1_lst=[
  146. aphrodite_to_hf_output(aphrodite_output, model)
  147. for aphrodite_output in aphrodite_outputs
  148. ],
  149. name_0="hf",
  150. name_1="aphrodite",
  151. )
  152. @pytest.mark.skipif(
  153. transformers.__version__ < "4.45",
  154. reason="Waiting for next transformers release",
  155. )
  156. @pytest.mark.parametrize("model", models)
  157. @pytest.mark.parametrize(
  158. "size_factors",
  159. [
  160. # No video
  161. [],
  162. # Single-scale
  163. [1.0],
  164. # Single-scale, batched
  165. [1.0, 1.0, 1.0],
  166. # Multi-scale
  167. [0.25, 0.5, 1.0],
  168. ],
  169. )
  170. @pytest.mark.parametrize("dtype", ["half"])
  171. @pytest.mark.parametrize("max_tokens", [128])
  172. @pytest.mark.parametrize("num_logprobs", [5])
  173. @pytest.mark.parametrize("num_frames", [16])
  174. def test_models(
  175. hf_runner,
  176. aphrodite_runner,
  177. video_assets,
  178. model,
  179. size_factors,
  180. dtype,
  181. max_tokens,
  182. num_logprobs,
  183. num_frames,
  184. ) -> None:
  185. """Inference result should be the same between hf and aphrodite.
  186. All the image fixtures for the test is under tests/videos.
  187. For huggingface runner, we provide the np.ndarray as input.
  188. For aphrodite runner, we provide MultiModalDataDict objects
  189. and corresponding MultiModalConfig as input.
  190. Note, the text input is also adjusted to abide by aphrodite contract.
  191. The text output is sanitized to be able to compare with hf.
  192. """
  193. run_test(
  194. hf_runner,
  195. aphrodite_runner,
  196. video_assets,
  197. model,
  198. size_factors=size_factors,
  199. dtype=dtype,
  200. max_tokens=max_tokens,
  201. num_logprobs=num_logprobs,
  202. num_frames=num_frames,
  203. tensor_parallel_size=1,
  204. )
  205. @pytest.mark.skipif(
  206. transformers.__version__ < "4.45",
  207. reason="Waiting for next transformers release",
  208. )
  209. @pytest.mark.parametrize("model", models)
  210. @pytest.mark.parametrize(
  211. "sizes",
  212. [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
  213. )
  214. @pytest.mark.parametrize("dtype", ["half"])
  215. @pytest.mark.parametrize("max_tokens", [128])
  216. @pytest.mark.parametrize("num_logprobs", [5])
  217. @pytest.mark.parametrize("num_frames", [16])
  218. def test_models_fixed_sizes(
  219. hf_runner,
  220. aphrodite_runner,
  221. video_assets,
  222. model,
  223. sizes,
  224. dtype,
  225. max_tokens,
  226. num_logprobs,
  227. num_frames,
  228. ) -> None:
  229. run_test(
  230. hf_runner,
  231. aphrodite_runner,
  232. video_assets,
  233. model,
  234. sizes=sizes,
  235. dtype=dtype,
  236. max_tokens=max_tokens,
  237. num_logprobs=num_logprobs,
  238. num_frames=num_frames,
  239. tensor_parallel_size=1,
  240. )