# test_llava_next_video.py

from typing import List, Optional, Tuple, Type, overload

import pytest
import transformers
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer

from aphrodite.common.sequence import SampleLogprobs
from aphrodite.multimodal.utils import (rescale_video_size, resize_video,
                                        sample_frames_from_video)

from ....conftest import VIDEO_ASSETS, AphroditeRunner, HfRunner, _VideoAssets
from ...utils import check_logprobs_close

_PREFACE = (
    "A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's "
    "questions.")

HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
    "sample_demo_1":
    f"{_PREFACE}USER: <video>\nWhy is this video funny? ASSISTANT:"
})

models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]


def aphrodite_to_hf_output(aphrodite_output: Tuple[List[int], str,
                                                   Optional[SampleLogprobs]],
                           model: str):
    """Sanitize aphrodite output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = aphrodite_output

    config = AutoConfig.from_pretrained(model)
    video_token_id = config.video_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Collapse runs of consecutive video placeholder tokens into a single
    # token, since the HF prompt contains only one <video> placeholder.
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if token_id != video_token_id or output_ids[idx - 1] != video_token_id
    ]

    # The decoded aphrodite string starts with a space; drop it, and append
    # the EOS token text when generation stopped at EOS, to match hf.
    assert output_str[0] == " "
    hf_output_str = output_str[1:]
    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
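

# A minimal sketch of the placeholder dedup above, with hypothetical token ids
# (32000 stands in for the model's real `video_token_index`):
#
#     ids = [32000, 32000, 32000, 319, 13]
#     deduped = [t for i, t in enumerate(ids)
#                if t != 32000 or ids[i - 1] != 32000]
#     assert deduped == [32000, 319, 13]
#
# `ids[i - 1]` wraps to the last element at i == 0, which still keeps the
# first placeholder as long as the output does not end with one.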


# The two overloads below document the mutually exclusive calling
# conventions: exactly one of `size_factors` or `sizes` must be given.
@overload
def run_test(
    hf_runner: Type[HfRunner],
    aphrodite_runner: Type[AphroditeRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


@overload
def run_test(
    hf_runner: Type[HfRunner],
    aphrodite_runner: Type[AphroditeRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    sizes: List[Tuple[int, int]],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


def run_test(
    hf_runner: Type[HfRunner],
    aphrodite_runner: Type[AphroditeRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    size_factors: Optional[List[float]] = None,
    sizes: Optional[List[Tuple[int, int]]] = None,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    videos = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    # Print sampled frame shapes to aid debugging resolution-dependent
    # failures.
    for video in videos:
        print(video.shape)

    if size_factors is not None:
        inputs_per_video = [(
            [prompt for _ in size_factors],
            [rescale_video_size(video, factor) for factor in size_factors],
        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
    elif sizes is not None:
        inputs_per_video = [(
            [prompt for _ in sizes],
            [resize_video(video, size) for size in sizes],
        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
    else:
        raise ValueError("You must provide either `size_factors` or `sizes`")
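
    # Illustrative shape of `inputs_per_video` (hypothetical values): one
    # (prompts, videos) pair per asset; e.g. size_factors=[0.25, 0.5, 1.0]
    # yields three copies of the prompt paired with three rescaled variants
    # of the same clip.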

    # max_model_len should be greater than image_feature_size
    with aphrodite_runner(model,
                          dtype=dtype,
                          max_model_len=4096,
                          tensor_parallel_size=tensor_parallel_size,
                          distributed_executor_backend=distributed_executor_backend,
                          enforce_eager=True) as aphrodite_model:
        aphrodite_outputs_per_video = [
            aphrodite_model.generate_greedy_logprobs(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     videos=videos)
            for prompts, videos in inputs_per_video
        ]

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_video = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    videos=videos)
            for prompts, videos in inputs_per_video
        ]

    for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_video,
                                             aphrodite_outputs_per_video):
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                aphrodite_to_hf_output(aphrodite_output, model)
                for aphrodite_output in aphrodite_outputs
            ],
            name_0="hf",
            name_1="aphrodite",
        )
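

# A hedged usage sketch (assumes, as in conftest, that the `hf_runner` and
# `aphrodite_runner` fixtures resolve to the HfRunner/AphroditeRunner classes):
#
#     run_test(HfRunner, AphroditeRunner, VIDEO_ASSETS,
#              "llava-hf/LLaVA-NeXT-Video-7B-hf",
#              size_factors=[1.0], dtype="half", max_tokens=128,
#              num_logprobs=5, num_frames=16, tensor_parallel_size=1)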


@pytest.mark.skipif(transformers.__version__ < "4.45",
                    reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No video
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models(hf_runner, aphrodite_runner, video_assets, model, size_factors,
                dtype, max_tokens, num_logprobs, num_frames) -> None:
    """Inference result should be the same between hf and aphrodite.

    All the video fixtures for this test are under tests/videos.
    For the huggingface runner, we provide the np.ndarray as input.
    For the aphrodite runner, we provide MultiModalDataDict objects
    and the corresponding MultiModalConfig as input.
    Note that the text input is also adjusted to abide by the aphrodite
    contract, and the text output is sanitized so it can be compared with hf.
    """
    run_test(
        hf_runner,
        aphrodite_runner,
        video_assets,
        model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        num_frames=num_frames,
        tensor_parallel_size=1,
    )


@pytest.mark.skipif(transformers.__version__ < "4.45",
                    reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "sizes",
    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models_fixed_sizes(hf_runner, aphrodite_runner, video_assets, model,
                            sizes, dtype, max_tokens, num_logprobs,
                            num_frames) -> None:
    run_test(
        hf_runner,
        aphrodite_runner,
        video_assets,
        model,
        sizes=sizes,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        num_frames=num_frames,
        tensor_parallel_size=1,
    )