test_audio.py

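"""End-to-end tests for audio input through the OpenAI-compatible chat
completions API, exercised against the Ultravox model on a locally
launched server."""
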
from typing import Dict, List

import openai
import pytest

from aphrodite.assets.audio import AudioAsset
from aphrodite.multimodal.utils import encode_audio_base64, fetch_audio

from ...utils import RemoteOpenAIServer

MODEL_NAME = "fixie-ai/ultravox-v0_3"

TEST_AUDIO_URLS = [
    AudioAsset("winning_call").url,
]

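# "winning_call" appears to be a sample clip bundled with aphrodite's audio
# assets; AudioAsset(...).url resolves it to a fetchable URL.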

@pytest.fixture(scope="module")
def server():
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "4096",
        "--enforce-eager",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server

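# RemoteOpenAIServer is assumed to launch the model behind an
# OpenAI-compatible HTTP endpoint for the duration of the module and
# shut it down when the context manager exits.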

@pytest.fixture(scope="module")
def client(server):
    return server.get_async_client()


@pytest.fixture(scope="session")
def base64_encoded_audio() -> Dict[str, str]:
    return {
        audio_url: encode_audio_base64(*fetch_audio(audio_url))
        for audio_url in TEST_AUDIO_URLS
    }

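# Illustrative sketch (not executed) of what the fixture builds, assuming
# fetch_audio returns an (audio, sample_rate) tuple as the starred call
# implies:
#
#     audio, sr = fetch_audio(audio_url)
#     b64 = encode_audio_base64(audio, sr)
#     data_url = f"data:audio/wav;base64,{b64}"  # used by the base64 test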

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
                                         model_name: str, audio_url: str):
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                }
            },
            {
                "type": "text",
                "text": "What's happening in this audio?"
            },
        ],
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(model=model_name,
                                                           messages=messages,
                                                           max_tokens=10,
                                                           logprobs=True,
                                                           top_logprobs=5)
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # NOTE: the expected prompt_tokens count (202) is tied to this model's
    # chat template and audio tokenization.
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_audio_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
        base64_encoded_audio: Dict[str, str]):
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "audio_url",
                "audio_url": {
                    "url":
                    f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
                }
            },
            {
                "type": "text",
                "text": "What's happening in this audio?"
            },
        ],
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(model=model_name,
                                                           messages=messages,
                                                           max_tokens=10,
                                                           logprobs=True,
                                                           top_logprobs=5)
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # the base64 data URL should tokenize identically to the remote URL above
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
                                    model_name: str, audio_url: str):
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                }
            },
            {
                "type": "text",
                "text": "What's happening in this audio?"
            },
        ],
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: List[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1

    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output

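# With temperature=0.0 both requests decode greedily, which is what makes
# the streamed chunks comparable to the non-streamed output above.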

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
                                 audio_url: str):
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                }
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                }
            },
            {
                "type": "text",
                "text": "What's happening in this audio?"
            },
        ],
    }]

    with pytest.raises(openai.BadRequestError):  # test multi-audio input
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=10,
            temperature=0.0,
        )

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    completion_text = completion.choices[0].text
    assert completion_text is not None and len(completion_text) >= 0
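
# To run only this file (the exact path depends on where it lives in the
# test tree, given the relative `...utils` import), something like
# `pytest .../test_audio.py -v` should work once the test dependencies
# are installed.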