# serving_embedding.py
  1. import asyncio
  2. import base64
  3. import time
  4. from typing import (AsyncGenerator, AsyncIterator, List, Optional, Tuple,
  5. Union, cast)
  6. import numpy as np
  7. from fastapi import Request
  8. from loguru import logger
  9. from aphrodite.common.config import ModelConfig
  10. from aphrodite.common.outputs import EmbeddingRequestOutput
  11. from aphrodite.common.utils import merge_async_iterators, random_uuid
  12. from aphrodite.endpoints.logger import RequestLogger
  13. from aphrodite.endpoints.openai.protocol import (EmbeddingRequest,
  14. EmbeddingResponse,
  15. EmbeddingResponseData,
  16. ErrorResponse, UsageInfo)
  17. from aphrodite.endpoints.openai.serving_engine import OpenAIServing
  18. from aphrodite.engine.protocol import EngineClient
  19. TypeTokenIDs = List[int]
  20. def request_output_to_embedding_response(
  21. final_res_batch: List[EmbeddingRequestOutput], request_id: str,
  22. created_time: int, model_name: str,
  23. encoding_format: str) -> EmbeddingResponse:
  24. data: List[EmbeddingResponseData] = []
  25. num_prompt_tokens = 0
  26. for idx, final_res in enumerate(final_res_batch):
  27. prompt_token_ids = final_res.prompt_token_ids
  28. embedding = final_res.outputs.embedding
  29. if encoding_format == "base64":
  30. # Force to use float32 for base64 encoding
  31. # to match the OpenAI python client behavior
  32. embedding_bytes = np.array(embedding, dtype="float32").tobytes()
  33. embedding = base64.b64encode(embedding_bytes).decode("utf-8")
  34. embedding_data = EmbeddingResponseData(index=idx, embedding=embedding)
  35. data.append(embedding_data)
  36. num_prompt_tokens += len(prompt_token_ids)
  37. usage = UsageInfo(
  38. prompt_tokens=num_prompt_tokens,
  39. total_tokens=num_prompt_tokens,
  40. )
  41. return EmbeddingResponse(
  42. id=request_id,
  43. created=created_time,
  44. model=model_name,
  45. data=data,
  46. usage=usage,
  47. )
  48. class OpenAIServingEmbedding(OpenAIServing):
  49. def __init__(
  50. self,
  51. engine_client: EngineClient,
  52. model_config: ModelConfig,
  53. served_model_names: List[str],
  54. *,
  55. request_logger: Optional[RequestLogger],
  56. ):
  57. super().__init__(engine_client=engine_client,
  58. model_config=model_config,
  59. served_model_names=served_model_names,
  60. lora_modules=None,
  61. prompt_adapters=None,
  62. request_logger=request_logger)
  63. self._enabled = self._check_embedding_mode(model_config.embedding_mode)
  64. async def create_embedding(
  65. self,
  66. request: EmbeddingRequest,
  67. raw_request: Optional[Request] = None
  68. ) -> Union[ErrorResponse, EmbeddingResponse]:
  69. """Completion API similar to OpenAI's API.
  70. See https://platform.openai.com/docs/api-reference/embeddings/create
  71. for the API specification. This API mimics the OpenAI Embedding API.
  72. """
  73. if not self._enabled:
  74. return self.create_error_response("Embedding API disabled")
  75. error_check_ret = await self._check_model(request)
  76. if error_check_ret is not None:
  77. return error_check_ret
  78. encoding_format = (request.encoding_format
  79. if request.encoding_format else "float")
  80. if request.dimensions is not None:
  81. return self.create_error_response(
  82. "dimensions is currently not supported")
  83. model_name = request.model
  84. request_id = f"embd-{random_uuid()}"
  85. created_time = int(time.monotonic())
  86. # Schedule the request and get the result generator.
  87. generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = []
  88. try:
  89. (
  90. lora_request,
  91. prompt_adapter_request,
  92. ) = self._maybe_get_adapters(request)
  93. tokenizer = await self.engine_client.get_tokenizer(lora_request)
  94. pooling_params = request.to_pooling_params()
  95. prompts = list(
  96. self._tokenize_prompt_input_or_inputs(
  97. request,
  98. tokenizer,
  99. request.input,
  100. ))
  101. for i, prompt_inputs in enumerate(prompts):
  102. request_id_item = f"{request_id}-{i}"
  103. self._log_inputs(request_id_item,
  104. prompt_inputs,
  105. params=pooling_params,
  106. lora_request=lora_request,
  107. prompt_adapter_request=prompt_adapter_request)
  108. if prompt_adapter_request is not None:
  109. raise NotImplementedError(
  110. "Prompt adapter is not supported "
  111. "for embedding models")
  112. generator = self.engine_client.encode(
  113. {"prompt_token_ids": prompt_inputs["prompt_token_ids"]},
  114. pooling_params,
  115. request_id_item,
  116. lora_request=lora_request,
  117. )
  118. generators.append(generator)
  119. except ValueError as e:
  120. # TODO: Use an aphrodite-specific Validation Error
  121. return self.create_error_response(str(e))
  122. result_generator: AsyncIterator[Tuple[
  123. int, EmbeddingRequestOutput]] = merge_async_iterators(
  124. *generators,
  125. is_cancelled=raw_request.is_disconnected
  126. if raw_request else None)
  127. # Non-streaming response
  128. final_res_batch: List[Optional[EmbeddingRequestOutput]]
  129. final_res_batch = [None] * len(prompts)
  130. try:
  131. async for i, res in result_generator:
  132. final_res_batch[i] = res
  133. for final_res in final_res_batch:
  134. assert final_res is not None
  135. final_res_batch_checked = cast(List[EmbeddingRequestOutput],
  136. final_res_batch)
  137. response = request_output_to_embedding_response(
  138. final_res_batch_checked, request_id, created_time, model_name,
  139. encoding_format)
  140. except asyncio.CancelledError:
  141. return self.create_error_response("Client disconnected")
  142. except ValueError as e:
  143. # TODO: Use an aphrodite-specific Validation Error
  144. return self.create_error_response(str(e))
  145. return response
  146. def _check_embedding_mode(self, embedding_mode: bool):
  147. if not embedding_mode:
  148. logger.warning(
  149. "embedding_mode is False. Embedding API will not work.")
  150. else:
  151. logger.info("Activating the server engine with embedding enabled.")
  152. return embedding_mode