# test_ultravox.py — consistency tests for the Ultravox audio model:
# compares HuggingFace reference outputs against Aphrodite outputs.
from typing import List, Optional, Tuple, Type

import librosa
import numpy as np
import pytest
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from aphrodite.assets.audio import AudioAsset
from aphrodite.common.sequence import SampleLogprobs
from aphrodite.common.utils import STR_DTYPE_TO_TORCH_DTYPE

from ..conftest import HfRunner, AphroditeRunner
from .utils import check_logprobs_close
  11. pytestmark = pytest.mark.vlm
  12. MODEL_NAME = "fixie-ai/ultravox-v0_3"
  13. AudioTuple = Tuple[np.ndarray, int]
  14. @pytest.fixture(scope="session")
  15. def audio_and_sample_rate():
  16. return AudioAsset("mary_had_lamb").audio_and_sample_rate
  17. @pytest.fixture
  18. def prompts_and_audios(audio_and_sample_rate):
  19. tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  20. aphrodite_placeholder = "<|reserved_special_token_0|>"
  21. hf_placeholder = "<|audio|>"
  22. question = "What's in the audio?"
  23. aphrodite_prompt = tokenizer.apply_chat_template(
  24. [{
  25. 'role': 'user',
  26. 'content': f"{aphrodite_placeholder}\n{question}"
  27. }],
  28. tokenize=False,
  29. add_generation_prompt=True)
  30. hf_prompt = tokenizer.apply_chat_template(
  31. [{
  32. 'role': 'user',
  33. 'content': f"{hf_placeholder}\n{question}"
  34. }],
  35. tokenize=False,
  36. add_generation_prompt=True)
  37. return [(aphrodite_prompt, hf_prompt, audio_and_sample_rate)]
  38. def aphrodite_to_hf_output(aphrodite_output: Tuple[List[int], str,
  39. Optional[SampleLogprobs]],
  40. model: str):
  41. """Sanitize aphrodite output to be comparable with hf output."""
  42. output_ids, output_str, out_logprobs = aphrodite_output
  43. tokenizer = AutoTokenizer.from_pretrained(model)
  44. eos_token_id = tokenizer.eos_token_id
  45. hf_output_ids = output_ids[:]
  46. hf_output_str = output_str
  47. if hf_output_ids[-1] == eos_token_id:
  48. hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
  49. return hf_output_ids, hf_output_str, out_logprobs
  50. def run_test(
  51. hf_runner: Type[HfRunner],
  52. aphrodite_runner: Type[AphroditeRunner],
  53. prompts_and_audios: List[Tuple[str, str, AudioTuple]],
  54. model: str,
  55. *,
  56. dtype: str,
  57. max_tokens: int,
  58. num_logprobs: int,
  59. tensor_parallel_size: int,
  60. distributed_executor_backend: Optional[str] = None,
  61. ):
  62. """Inference result should be the same between hf and aphrodite."""
  63. torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
  64. # NOTE: take care of the order. run Aphrodite first, and then run HF.
  65. # Aphrodite needs a fresh new process without cuda initialization.
  66. # if we run HF first, the cuda initialization will be done and it
  67. # will hurt multiprocessing backend with fork method (the default method).
  68. with aphrodite_runner(model,
  69. dtype=dtype,
  70. tensor_parallel_size=tensor_parallel_size,
  71. distributed_executor_backend=distributed_executor_backend,
  72. enforce_eager=True) as aphrodite_model:
  73. aphrodite_outputs_per_audio = [
  74. aphrodite_model.generate_greedy_logprobs([aphrodite_prompt],
  75. max_tokens,
  76. num_logprobs=num_logprobs,
  77. audios=[audio])
  78. for aphrodite_prompt, _, audio in prompts_and_audios
  79. ]
  80. def process(hf_inputs: BatchEncoding):
  81. hf_inputs["audio_values"] = hf_inputs["audio_values"] \
  82. .to(torch_dtype) # type: ignore
  83. return hf_inputs
  84. with hf_runner(model,
  85. dtype=dtype,
  86. postprocess_inputs=process,
  87. auto_cls=AutoModel) as hf_model:
  88. hf_outputs_per_audio = [
  89. hf_model.generate_greedy_logprobs_limit(
  90. [hf_prompt],
  91. max_tokens,
  92. num_logprobs=num_logprobs,
  93. audios=[(librosa.resample(audio[0],
  94. orig_sr=audio[1],
  95. target_sr=16000), 16000)])
  96. for _, hf_prompt, audio in prompts_and_audios
  97. ]
  98. for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_audio,
  99. aphrodite_outputs_per_audio):
  100. check_logprobs_close(
  101. outputs_0_lst=hf_outputs,
  102. outputs_1_lst=[
  103. aphrodite_to_hf_output(aphrodite_output, model)
  104. for aphrodite_output in aphrodite_outputs
  105. ],
  106. name_0="hf",
  107. name_1="aphrodite",
  108. )
  109. @pytest.mark.parametrize("dtype", ["half"])
  110. @pytest.mark.parametrize("max_tokens", [128])
  111. @pytest.mark.parametrize("num_logprobs", [5])
  112. def test_models(hf_runner, aphrodite_runner, prompts_and_audios, dtype: str,
  113. max_tokens: int, num_logprobs: int) -> None:
  114. run_test(
  115. hf_runner,
  116. aphrodite_runner,
  117. prompts_and_audios,
  118. MODEL_NAME,
  119. dtype=dtype,
  120. max_tokens=max_tokens,
  121. num_logprobs=num_logprobs,
  122. tensor_parallel_size=1,
  123. )