audio_example.py

  1. """
  2. This example shows how to use vLLM for running offline inference
  3. with the correct prompt format on vision language models.
  4. For most models, the prompt format should follow corresponding examples
  5. on HuggingFace model repository.
  6. """
from transformers import AutoTokenizer

from aphrodite import LLM, SamplingParams
from aphrodite.assets.audio import AudioAsset
from aphrodite.common.utils import FlexibleArgumentParser

# Input audio and question
# audio_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
#                           "mary_had_lamb.ogg")
# audio_and_sample_rate = librosa.load(audio_path, sr=None)
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question_per_audio_count = [
    "What is recited in the audio?",
    "What sport and what nursery rhyme are referenced?"
]
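
# Each AudioAsset bundles the decoded audio and its sample rate
# (`asset.audio_and_sample_rate`), which `main` passes to the engine
# as multi-modal data.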


# Ultravox 0.3
def run_ultravox(question, audio_count):
    model_name = "fixie-ai/ultravox-v0_3"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
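    # One audio placeholder token is inserted per audio clip, marking
    # where each audio input belongs in the user message.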
    messages = [{
        'role':
        'user',
        'content':
        "<|reserved_special_token_0|>\n" * audio_count + question
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
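
    # limit_mm_per_prompt caps how many audio items a single prompt may contain.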
    llm = LLM(model=model_name,
              enforce_eager=True,
              enable_chunked_prefill=False,
              max_model_len=8192,
              limit_mm_per_prompt={"audio": audio_count})
    stop_token_ids = None
    return llm, prompt, stop_token_ids


model_example_map = {
    "ultravox": run_ultravox,
}
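# To support another audio model, add an entry to model_example_map above
# that returns the same (llm, prompt, stop_token_ids) triple as run_ultravox.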


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
    llm, prompt, stop_token_ids = model_example_map[model](
        question_per_audio_count[audio_count - 1], audio_count)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    assert args.num_prompts > 0
    inputs = {
        "prompt": prompt,
        "multi_modal_data": {
            "audio": [
                asset.audio_and_sample_rate
                for asset in audio_assets[:audio_count]
            ]
        },
    }
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using Aphrodite for offline inference with '
        'audio language models')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="ultravox",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument('--num-prompts',
                        type=int,
                        default=1,
                        help='Number of prompts to run.')
    parser.add_argument("--num-audios",
                        type=int,
                        default=1,
                        choices=[1, 2],
                        help="Number of audio items per prompt.")

    args = parser.parse_args()
    main(args)