# test_ultravox.py
  1. from typing import List, Optional, Tuple, Type
  2. import numpy as np
  3. import pytest
  4. from transformers import AutoModel, AutoTokenizer, BatchEncoding
  5. from aphrodite.common.sequence import SampleLogprobs
  6. from aphrodite.common.utils import STR_DTYPE_TO_TORCH_DTYPE
  7. from ..conftest import AphroditeRunner, HfRunner
  8. from .utils import check_logprobs_close
  9. pytestmark = pytest.mark.vlm
  10. MODEL_NAME = "fixie-ai/ultravox-v0_3"
  11. AudioTuple = Tuple[np.ndarray, int]
  12. APHRODITE_PLACEHOLDER = "<|reserved_special_token_0|>"
  13. HF_PLACEHOLDER = "<|audio|>"
  14. @pytest.fixture(scope="session")
  15. def audio_assets():
  16. from aphrodite.assets.audio import AudioAsset
  17. return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
  18. @pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
  19. def audio(request):
  20. from aphrodite.assets.audio import AudioAsset
  21. return AudioAsset(request.param)
  22. def _get_prompt(audio_count, question, placeholder):
  23. tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  24. placeholder = f"{placeholder}\n" * audio_count
  25. return tokenizer.apply_chat_template([{
  26. 'role': 'user',
  27. 'content': f"{placeholder}{question}"
  28. }],
  29. tokenize=False,
  30. add_generation_prompt=True)
  31. def aphrodite_to_hf_output(aphrodite_output: Tuple[List[int], str,
  32. Optional[SampleLogprobs]],
  33. model: str):
  34. """Sanitize aphrodite output to be comparable with hf output."""
  35. output_ids, output_str, out_logprobs = aphrodite_output
  36. tokenizer = AutoTokenizer.from_pretrained(model)
  37. eos_token_id = tokenizer.eos_token_id
  38. hf_output_ids = output_ids[:]
  39. hf_output_str = output_str
  40. if hf_output_ids[-1] == eos_token_id:
  41. hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
  42. return hf_output_ids, hf_output_str, out_logprobs
def run_test(
    hf_runner: Type[HfRunner],
    aphrodite_runner: Type[AphroditeRunner],
    prompts_and_audios: List[Tuple[str, str, AudioTuple]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference result should be the same between hf and aphrodite.

    Runs greedy generation with logprobs on the Aphrodite engine and the
    HuggingFace model for each (aphrodite_prompt, hf_prompt, audio) triple,
    then checks the per-token logprobs agree via ``check_logprobs_close``.
    """
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    # NOTE: take care of the order. run Aphrodite first, and then run HF.
    # Aphrodite needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with aphrodite_runner(model,
                          dtype=dtype,
                          tensor_parallel_size=tensor_parallel_size,
                          distributed_executor_backend=distributed_executor_backend,
                          enforce_eager=True) as aphrodite_model:
        # One generate call per prompt so each request carries exactly one
        # audio clip.
        aphrodite_outputs_per_audio = [
            aphrodite_model.generate_greedy_logprobs([aphrodite_prompt],
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     audios=[audio])
            for aphrodite_prompt, _, audio in prompts_and_audios
        ]

    def process(hf_inputs: BatchEncoding):
        # Cast the audio features to the dtype under test before the HF
        # forward pass.
        hf_inputs["audio_values"] = hf_inputs["audio_values"] \
            .to(torch_dtype)  # type: ignore
        return hf_inputs

    with hf_runner(model,
                   dtype=dtype,
                   postprocess_inputs=process,
                   auto_cls=AutoModel) as hf_model:
        import librosa

        # Resample each clip to 16 kHz before handing it to the HF model.
        # NOTE(review): 16 kHz is presumably the model's expected sampling
        # rate — confirm against the Ultravox processor config.
        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],
                max_tokens,
                num_logprobs=num_logprobs,
                audios=[(librosa.resample(audio[0],
                                          orig_sr=audio[1],
                                          target_sr=16000), 16000)])
            for _, hf_prompt, audio in prompts_and_audios
        ]

    # Compare HF and (sanitized) Aphrodite outputs pairwise, per audio.
    for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_audio,
                                             aphrodite_outputs_per_audio):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                aphrodite_to_hf_output(aphrodite_output, model)
                for aphrodite_output in aphrodite_outputs
            ],
            name_0="hf",
            name_1="aphrodite",
        )
  103. def run_multi_audio_test(
  104. aphrodite_runner: Type[AphroditeRunner],
  105. prompts_and_audios: List[Tuple[str, List[AudioTuple]]],
  106. model: str,
  107. *,
  108. dtype: str,
  109. max_tokens: int,
  110. num_logprobs: int,
  111. tensor_parallel_size: int,
  112. distributed_executor_backend: Optional[str] = None,
  113. ):
  114. with aphrodite_runner(model,
  115. dtype=dtype,
  116. tensor_parallel_size=tensor_parallel_size,
  117. distributed_executor_backend=distributed_executor_backend,
  118. enforce_eager=True,
  119. limit_mm_per_prompt={
  120. "audio":
  121. max((len(audio) for _, audio in prompts_and_audios))
  122. }) as aphrodite_model:
  123. aphrodite_outputs = aphrodite_model.generate_greedy_logprobs(
  124. [prompt for prompt, _ in prompts_and_audios],
  125. max_tokens,
  126. num_logprobs=num_logprobs,
  127. audios=[audios for _, audios in prompts_and_audios])
  128. # The HuggingFace model doesn't support multiple audios yet, so
  129. # just assert that some tokens were generated.
  130. assert all(tokens for tokens, *_ in aphrodite_outputs)
  131. @pytest.mark.parametrize("dtype", ["half"])
  132. @pytest.mark.parametrize("max_tokens", [128])
  133. @pytest.mark.parametrize("num_logprobs", [5])
  134. def test_models(hf_runner, aphrodite_runner, audio, dtype: str, max_tokens: int,
  135. num_logprobs: int) -> None:
  136. aphrodite_prompt = _get_prompt(1, "Describe the audio above.",
  137. APHRODITE_PLACEHOLDER)
  138. hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
  139. run_test(
  140. hf_runner,
  141. aphrodite_runner,
  142. [(aphrodite_prompt, hf_prompt, audio.audio_and_sample_rate)],
  143. MODEL_NAME,
  144. dtype=dtype,
  145. max_tokens=max_tokens,
  146. num_logprobs=num_logprobs,
  147. tensor_parallel_size=1,
  148. )
  149. @pytest.mark.parametrize("dtype", ["half"])
  150. @pytest.mark.parametrize("max_tokens", [128])
  151. @pytest.mark.parametrize("num_logprobs", [5])
  152. def test_models_with_multiple_audios(aphrodite_runner, audio_assets, dtype: str,
  153. max_tokens: int,
  154. num_logprobs: int) -> None:
  155. aphrodite_prompt = _get_prompt(len(audio_assets),
  156. "Describe each of the audios above.",
  157. APHRODITE_PLACEHOLDER)
  158. run_multi_audio_test(
  159. aphrodite_runner,
  160. [(aphrodite_prompt, [audio.audio_and_sample_rate
  161. for audio in audio_assets])],
  162. MODEL_NAME,
  163. dtype=dtype,
  164. max_tokens=max_tokens,
  165. num_logprobs=num_logprobs,
  166. tensor_parallel_size=1,
  167. )