# test_ultravox.py
  1. from typing import List, Optional, Tuple, Type
  2. import numpy as np
  3. import pytest
  4. from transformers import AutoModel, AutoTokenizer, BatchEncoding
  5. from aphrodite.common.sequence import SampleLogprobs
  6. from aphrodite.common.utils import STR_DTYPE_TO_TORCH_DTYPE
  7. from ....conftest import AphroditeRunner, HfRunner
  8. from ...utils import check_logprobs_close
  9. MODEL_NAME = "fixie-ai/ultravox-v0_3"
  10. AudioTuple = Tuple[np.ndarray, int]
  11. APHRODITE_PLACEHOLDER = "<|reserved_special_token_0|>"
  12. HF_PLACEHOLDER = "<|audio|>"
  13. @pytest.fixture(scope="session")
  14. def audio_assets():
  15. from aphrodite.assets.audio import AudioAsset
  16. return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
  17. @pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
  18. def audio(request):
  19. from aphrodite.assets.audio import AudioAsset
  20. return AudioAsset(request.param)
  21. def _get_prompt(audio_count, question, placeholder):
  22. tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  23. placeholder = f"{placeholder}\n" * audio_count
  24. return tokenizer.apply_chat_template([{
  25. 'role': 'user',
  26. 'content': f"{placeholder}{question}"
  27. }],
  28. tokenize=False,
  29. add_generation_prompt=True)
  30. def aphrodite_to_hf_output(aphrodite_output: Tuple[List[int], str,
  31. Optional[SampleLogprobs]],
  32. model: str):
  33. """Sanitize aphrodite output to be comparable with hf output."""
  34. output_ids, output_str, out_logprobs = aphrodite_output
  35. tokenizer = AutoTokenizer.from_pretrained(model)
  36. eos_token_id = tokenizer.eos_token_id
  37. hf_output_ids = output_ids[:]
  38. hf_output_str = output_str
  39. if hf_output_ids[-1] == eos_token_id:
  40. hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
  41. return hf_output_ids, hf_output_str, out_logprobs
  42. def run_test(
  43. hf_runner: Type[HfRunner],
  44. aphrodite_runner: Type[AphroditeRunner],
  45. prompts_and_audios: List[Tuple[str, str, AudioTuple]],
  46. model: str,
  47. *,
  48. dtype: str,
  49. max_tokens: int,
  50. num_logprobs: int,
  51. tensor_parallel_size: int,
  52. distributed_executor_backend: Optional[str] = None,
  53. ):
  54. """Inference result should be the same between hf and aphrodite."""
  55. torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
  56. # NOTE: take care of the order. run Aphrodite first, and then run HF.
  57. # Aphrodite needs a fresh new process without cuda initialization.
  58. # if we run HF first, the cuda initialization will be done and it
  59. # will hurt multiprocessing backend with fork method (the default method).
  60. with aphrodite_runner(model,
  61. dtype=dtype,
  62. tensor_parallel_size=tensor_parallel_size,
  63. distributed_executor_backend=distributed_executor_backend,
  64. enforce_eager=True) as aphrodite_model:
  65. aphrodite_outputs_per_audio = [
  66. aphrodite_model.generate_greedy_logprobs([aphrodite_prompt],
  67. max_tokens,
  68. num_logprobs=num_logprobs,
  69. audios=[audio])
  70. for aphrodite_prompt, _, audio in prompts_and_audios
  71. ]
  72. def process(hf_inputs: BatchEncoding):
  73. hf_inputs["audio_values"] = hf_inputs["audio_values"] \
  74. .to(torch_dtype) # type: ignore
  75. return hf_inputs
  76. with hf_runner(model,
  77. dtype=dtype,
  78. postprocess_inputs=process,
  79. auto_cls=AutoModel) as hf_model:
  80. import librosa
  81. hf_outputs_per_audio = [
  82. hf_model.generate_greedy_logprobs_limit(
  83. [hf_prompt],
  84. max_tokens,
  85. num_logprobs=num_logprobs,
  86. audios=[(librosa.resample(audio[0],
  87. orig_sr=audio[1],
  88. target_sr=16000), 16000)])
  89. for _, hf_prompt, audio in prompts_and_audios
  90. ]
  91. for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_audio,
  92. aphrodite_outputs_per_audio):
  93. check_logprobs_close(
  94. outputs_0_lst=hf_outputs,
  95. outputs_1_lst=[
  96. aphrodite_to_hf_output(aphrodite_output, model)
  97. for aphrodite_output in aphrodite_outputs
  98. ],
  99. name_0="hf",
  100. name_1="aphrodite",
  101. )
  102. def run_multi_audio_test(
  103. aphrodite_runner: Type[AphroditeRunner],
  104. prompts_and_audios: List[Tuple[str, List[AudioTuple]]],
  105. model: str,
  106. *,
  107. dtype: str,
  108. max_tokens: int,
  109. num_logprobs: int,
  110. tensor_parallel_size: int,
  111. distributed_executor_backend: Optional[str] = None,
  112. ):
  113. with aphrodite_runner(model,
  114. dtype=dtype,
  115. tensor_parallel_size=tensor_parallel_size,
  116. distributed_executor_backend=distributed_executor_backend,
  117. enforce_eager=True,
  118. limit_mm_per_prompt={
  119. "audio":
  120. max((len(audio) for _, audio in prompts_and_audios))
  121. }) as aphrodite_model:
  122. aphrodite_outputs = aphrodite_model.generate_greedy_logprobs(
  123. [prompt for prompt, _ in prompts_and_audios],
  124. max_tokens,
  125. num_logprobs=num_logprobs,
  126. audios=[audios for _, audios in prompts_and_audios])
  127. # The HuggingFace model doesn't support multiple audios yet, so
  128. # just assert that some tokens were generated.
  129. assert all(tokens for tokens, *_ in aphrodite_outputs)
  130. @pytest.mark.parametrize("dtype", ["half"])
  131. @pytest.mark.parametrize("max_tokens", [128])
  132. @pytest.mark.parametrize("num_logprobs", [5])
  133. def test_models(hf_runner, aphrodite_runner, audio, dtype: str, max_tokens: int,
  134. num_logprobs: int) -> None:
  135. aphrodite_prompt = _get_prompt(
  136. 1, "Describe the audio above.", APHRODITE_PLACEHOLDER)
  137. hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
  138. run_test(
  139. hf_runner,
  140. aphrodite_runner,
  141. [(aphrodite_prompt, hf_prompt, audio.audio_and_sample_rate)],
  142. MODEL_NAME,
  143. dtype=dtype,
  144. max_tokens=max_tokens,
  145. num_logprobs=num_logprobs,
  146. tensor_parallel_size=1,
  147. )
  148. @pytest.mark.parametrize("dtype", ["half"])
  149. @pytest.mark.parametrize("max_tokens", [128])
  150. @pytest.mark.parametrize("num_logprobs", [5])
  151. def test_models_with_multiple_audios(aphrodite_runner, audio_assets, dtype: str,
  152. max_tokens: int,
  153. num_logprobs: int) -> None:
  154. aphrodite_prompt = _get_prompt(len(audio_assets),
  155. "Describe each of the audios above.",
  156. APHRODITE_PLACEHOLDER)
  157. run_multi_audio_test(
  158. aphrodite_runner,
  159. [(aphrodite_prompt, [audio.audio_and_sample_rate
  160. for audio in audio_assets])],
  161. MODEL_NAME,
  162. dtype=dtype,
  163. max_tokens=max_tokens,
  164. num_logprobs=num_logprobs,
  165. tensor_parallel_size=1,
  166. )