
chore: allow multiple served model names

AlpinDale, 10 months ago
commit 8f9cb7235c

+ 6 - 6
aphrodite/endpoints/openai/api_server.py

@@ -357,7 +357,7 @@ async def get_version():
 
 @kai_api.get("/model")
 async def get_model():
-    return JSONResponse({"result": f"aphrodite/{served_model}"})
+    return JSONResponse({"result": f"aphrodite/{served_model_names[0]}"})
 
 
 @kai_api.get("/config/soft_prompts_list")
@@ -509,11 +509,11 @@ def run_server(args):
     logger.debug(f"args: {args}")
 
     global engine, engine_args, openai_serving_chat, openai_serving_completion,\
-        tokenizer, served_model
+        tokenizer, served_model_names
     if args.served_model_name is not None:
-        served_model = args.served_model_name
+        served_model_names = args.served_model_name
     else:
-        served_model = args.model
+        served_model_names = [args.model]
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncAphrodite.from_engine_args(engine_args)
@@ -528,12 +528,12 @@ def run_server(args):
     if chat_template is None and tokenizer.chat_template is not None:
         chat_template = tokenizer.chat_template
 
-    openai_serving_chat = OpenAIServingChat(engine, served_model,
+    openai_serving_chat = OpenAIServingChat(engine, served_model_names,
                                             args.response_role,
                                             args.lora_modules,
                                             args.chat_template)
     openai_serving_completion = OpenAIServingCompletion(
-        engine, served_model, args.lora_modules)
+        engine, served_model_names, args.lora_modules)
     engine_model_config = asyncio.run(engine.get_model_config())
 
     if args.launch_kobold_api:

+ 7 - 3
aphrodite/endpoints/openai/args.py

@@ -67,11 +67,15 @@ def make_arg_parser(parser=None):
                         help="The maximum length of the generated response. "
                         "For use with Kobold Horde.")
     parser.add_argument("--served-model-name",
+                        nargs="+",
                         type=str,
                         default=None,
-                        help="The model name used in the API. If not "
-                        "specified, the model name will be the same as "
-                        "the huggingface name.")
+                        help="The model name(s) used in the API. If multiple "
+                        "names are provided, the server will respond to any "
+                        "of the provided names. The model name in the model "
+                        "field of a response will be the first name in this "
+                        "list. If not specified, the model name will be the "
+                        "same as the `--model` argument.")
     parser.add_argument(
         "--lora-modules",
         type=str,
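
A quick sketch of the argparse behaviour this relies on (the model names below are made up for illustration): `nargs="+"` collects one or more values into a list, and an omitted flag stays `None`, which is why `run_server` falls back to `[args.model]`.

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--served-model-name", nargs="+", type=str, default=None)
    parser.add_argument("--model", type=str)

    # Multiple aliases collapse into a single list (hypothetical names).
    args = parser.parse_args(["--model", "org/Some-7B-Model",
                              "--served-model-name", "some-model", "some-model-7b"])
    print(args.served_model_name)  # ['some-model', 'some-model-7b']

    # Flag omitted: value stays None, so the server uses [args.model] instead.
    args = parser.parse_args(["--model", "org/Some-7B-Model"])
    print(args.served_model_name)  # None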

+ 4 - 4
aphrodite/endpoints/openai/serving_chat.py

@@ -28,12 +28,12 @@ class OpenAIServingChat(OpenAIServing):
 
     def __init__(self,
                  engine: AsyncAphrodite,
-                 served_model: str,
+                 served_model_names: List[str],
                  response_role: str,
                  lora_modules: Optional[List[LoRA]] = None,
                  chat_template=None):
         super().__init__(engine=engine,
-                         served_model=served_model,
+                         served_model_names=served_model_names,
                          lora_modules=lora_modules)
         self.response_role = response_role
         self._load_chat_template(chat_template)
@@ -109,7 +109,7 @@ class OpenAIServingChat(OpenAIServing):
             result_generator: AsyncIterator[RequestOutput], request_id: str
     ) -> Union[ErrorResponse, AsyncGenerator[str, None]]:
 
-        model_name = request.model
+        model_name = self.served_model_names[0]
         created_time = int(time.time())
         chunk_object_type = "chat.completion.chunk"
         first_iteration = True
@@ -251,7 +251,7 @@ class OpenAIServingChat(OpenAIServing):
             result_generator: AsyncIterator[RequestOutput],
             request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]:
 
-        model_name = request.model
+        model_name = self.served_model_names[0]
         created_time = int(time.time())
         final_res: RequestOutput = None
 

+ 3 - 3
aphrodite/endpoints/openai/serving_completions.py

@@ -60,10 +60,10 @@ class OpenAIServingCompletion(OpenAIServing):
 
     def __init__(self,
                  engine: AsyncAphrodite,
-                 served_model: str,
+                 served_model_names: List[str],
                  lora_modules: Optional[List[LoRA]] = None):
         super().__init__(engine=engine,
-                         served_model=served_model,
+                         served_model_names=served_model_names,
                          lora_modules=lora_modules)
 
     async def create_completion(self, request: CompletionRequest,
@@ -86,7 +86,7 @@ class OpenAIServingCompletion(OpenAIServing):
             return self.create_error_response(
                 "suffix is not currently supported")
 
-        model_name = request.model
+        model_name = self.served_model_names[0]
         request_id = f"cmpl-{random_uuid()}"
         created_time = int(time.time())
 

+ 8 - 7
aphrodite/endpoints/openai/serving_engine.py

@@ -33,10 +33,10 @@ class OpenAIServing:
 
     def __init__(self,
                  engine: AsyncAphrodite,
-                 served_model: str,
+                 served_model_names: List[str],
                  lora_modules=Optional[List[LoRA]]):
         self.engine = engine
-        self.served_model = served_model
+        self.served_model_names = served_model_names
         if lora_modules is None:
             self.lora_requests = []
         else:
@@ -79,13 +79,14 @@ class OpenAIServing:
     async def show_available_models(self) -> ModelList:
         """Show available models. Right now we only have one model."""
         model_cards = [
-            ModelCard(id=self.served_model,
-                      root=self.served_model,
+            ModelCard(id=served_model_name,
+                      root=self.served_model_names[0],
                       permission=[ModelPermission()])
+            for served_model_name in self.served_model_names
         ]
         lora_cards = [
             ModelCard(id=lora.lora_name,
-                      root=self.served_model,
+                      root=self.served_model_names[0],
                       permission=[ModelPermission()])
             for lora in self.lora_requests
         ]
@@ -175,7 +176,7 @@ class OpenAIServing:
         return json_str
 
     async def _check_model(self, request) -> Optional[ErrorResponse]:
-        if request.model == self.served_model:
+        if request.model in self.served_model_names:
             return
         if request.model in [lora.lora_name for lora in self.lora_requests]:
             return
@@ -203,7 +204,7 @@ class OpenAIServing:
         ]
 
     def _maybe_get_lora(self, request) -> Optional[LoRARequest]:
-        if request.model == self.served_model:
+        if request.model in self.served_model_names:
             return
         for lora in self.lora_requests:
             if request.model == lora.lora_name:

+ 1 - 1
aphrodite/engine/metrics.py

@@ -163,7 +163,7 @@ class StatLogger:
 
     def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:
         # Metadata for logging locally.
-        self.last_local_log = time.time()
+        self.last_local_log = time.monotonic()
         self.local_interval = local_interval
 
         # Tracked stats over current local logging interval.
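
The metrics tweak is unrelated to model names: measuring the local logging interval with `time.monotonic()` instead of `time.time()` keeps the elapsed-time check correct even if the wall clock is adjusted (NTP sync, manual changes). A minimal sketch of the pattern, not the actual StatLogger code:

    import time


    class IntervalGate:
        """Fire at most once per `interval` seconds, immune to wall-clock jumps."""

        def __init__(self, interval: float) -> None:
            self.interval = interval
            self.last = time.monotonic()

        def ready(self) -> bool:
            now = time.monotonic()
            if now - self.last >= self.interval:
                self.last = now
                return True
            return False


    gate = IntervalGate(interval=5.0)
    print(gate.ready())  # False immediately after construction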