
api: support LoRA lineage and base model metadata management (#1072)

* api: support LoRA lineage and base model metadata management

* some f-string fixes
AlpinDale 2 months ago
Parent
Commit
6212072245

+ 1 - 1
aphrodite/endpoints/chat_utils.py

@@ -325,7 +325,7 @@ def load_chat_template(
         # ensure we decode so our escape are interpreted correctly
         resolved_chat_template = codecs.decode(chat_template, "unicode_escape")
 
-    logger.info("Using supplied chat template:\n%s", resolved_chat_template)
+    logger.info(f"Using supplied chat template:\n{resolved_chat_template}")
     return resolved_chat_template
 
 

+ 11 - 5
aphrodite/endpoints/openai/api_server.py

@@ -48,7 +48,8 @@ from aphrodite.endpoints.openai.serving_chat import OpenAIServingChat
 from aphrodite.endpoints.openai.serving_completions import (
     OpenAIServingCompletion)
 from aphrodite.endpoints.openai.serving_embedding import OpenAIServingEmbedding
-from aphrodite.endpoints.openai.serving_engine import (LoRAModulePath,
+from aphrodite.endpoints.openai.serving_engine import (BaseModelPath,
+                                                       LoRAModulePath,
                                                        PromptAdapterPath)
 from aphrodite.endpoints.openai.serving_tokenization import (
     OpenAIServingTokenization)
@@ -1116,6 +1117,11 @@ def init_app_state(
     else:
         request_logger = RequestLogger(max_log_len=args.max_log_len)
 
+    base_model_paths = [
+        BaseModelPath(name=name, model_path=args.model)
+        for name in served_model_names
+    ]
+
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats
     state.current_model = args.model
@@ -1123,7 +1129,7 @@ def init_app_state(
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
-        served_model_names,
+        base_model_paths,
         args.response_role,
         lora_modules=args.lora_modules,
         prompt_adapters=args.prompt_adapters,
@@ -1136,7 +1142,7 @@ def init_app_state(
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
-        served_model_names,
+        base_model_paths,
         lora_modules=args.lora_modules,
         prompt_adapters=args.prompt_adapters,
         request_logger=request_logger,
@@ -1145,13 +1151,13 @@ def init_app_state(
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
-        served_model_names,
+        base_model_paths,
         request_logger=request_logger,
     )
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
-        served_model_names,
+        base_model_paths,
         lora_modules=args.lora_modules,
         request_logger=request_logger,
         chat_template=args.chat_template,

+ 23 - 4
aphrodite/endpoints/openai/args.py

@@ -19,8 +19,23 @@ class LoRAParserAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         lora_list = []
         for item in values:
-            name, path = item.split('=')
-            lora_list.append(LoRAModulePath(name, path))
+            if item in [None, '']:  # Skip if item is None or empty string
+                continue
+            if '=' in item and ',' not in item:  # Old format: name=path
+                name, path = item.split('=')
+                lora_list.append(LoRAModulePath(name, path))
+            else:  # Assume JSON format
+                try:
+                    lora_dict = json.loads(item)
+                    lora = LoRAModulePath(**lora_dict)
+                    lora_list.append(lora)
+                except json.JSONDecodeError:
+                    parser.error(
+                        f"Invalid JSON format for --lora-modules: {item}")
+                except TypeError as e:
+                    parser.error(
+                        f"Invalid fields for --lora-modules: {item} - {str(e)}"
+                    )
         setattr(namespace, self.dest, lora_list)
 
 
@@ -74,8 +89,12 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         default=None,
         nargs='+',
         action=LoRAParserAction,
-        help="LoRA module configurations in the format name=path. "
-        "Multiple modules can be specified.")
+        help="LoRA module configurations in either 'name=path' format"
+        "or JSON format. "
+        "Example (old format): 'name=path' "
+        "Example (new format): "
+        "'{\"name\": \"name\", \"local_path\": \"path\", "
+        "\"base_model_name\": \"id\"}'")
     parser.add_argument(
         "--prompt-adapters",
         type=str,
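
For reference, a minimal sketch of how the updated --lora-modules parsing behaves, mirroring the new tests/endpoints/openai/test_cli_args.py added below; the adapter name, path, and base model used here are hypothetical:

import json

from aphrodite.common.utils import FlexibleArgumentParser
from aphrodite.endpoints.openai.args import make_arg_parser

parser = make_arg_parser(FlexibleArgumentParser())

# Old key=value format still works; base_model_name stays None.
args = parser.parse_args(["--lora-modules", "my-lora=/path/to/my-lora"])

# New JSON format can additionally record the adapter's base model.
args = parser.parse_args([
    "--lora-modules",
    json.dumps({
        "name": "my-lora",
        "path": "/path/to/my-lora",
        "base_model_name": "my-base-model",
    }),
])
print(args.lora_modules)  # [LoRAModulePath(name='my-lora', path='/path/to/my-lora', base_model_name='my-base-model')]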

+ 126 - 24
aphrodite/endpoints/openai/run_batch.py

@@ -1,20 +1,26 @@
 import asyncio
+from http import HTTPStatus
 from io import StringIO
-from typing import Awaitable, Callable, List
+from typing import Awaitable, Callable, List, Optional
 
 import aiohttp
-from loguru import logger
+import torch
+from prometheus_client import start_http_server
+from tqdm import tqdm
 
 from aphrodite.common.utils import FlexibleArgumentParser, random_uuid
-from aphrodite.endpoints.logger import RequestLogger
+from aphrodite.endpoints.logger import RequestLogger, logger
+# yapf: disable
 from aphrodite.endpoints.openai.protocol import (BatchRequestInput,
                                                  BatchRequestOutput,
                                                  BatchResponseData,
                                                  ChatCompletionResponse,
                                                  EmbeddingResponse,
                                                  ErrorResponse)
+# yapf: enable
 from aphrodite.endpoints.openai.serving_chat import OpenAIServingChat
 from aphrodite.endpoints.openai.serving_embedding import OpenAIServingEmbedding
+from aphrodite.endpoints.openai.serving_engine import BaseModelPath
 from aphrodite.engine.args_tools import AsyncEngineArgs
 from aphrodite.engine.async_aphrodite import AsyncAphrodite
 from aphrodite.version import __version__ as APHRODITE_VERSION
@@ -44,18 +50,70 @@ def parse_args():
                         type=str,
                         default="assistant",
                         help="The role name to return if "
-                        "`request.add_generation_prompt=true`.")
-    parser.add_argument("--max-log-len",
-                        type=int,
-                        default=0,
-                        help="Max number of prompt characters or prompt "
-                        "ID numbers being printed in log."
-                        "\n\nDefault: 0")
+                        "`request.add_generation_prompt=True`.")
 
     parser = AsyncEngineArgs.add_cli_args(parser)
+
+    parser.add_argument('--max-log-len',
+                        type=int,
+                        default=None,
+                        help='Max number of prompt characters or prompt '
+                        'ID numbers being printed in log.'
+                        '\n\nDefault: Unlimited')
+
+    parser.add_argument("--enable-metrics",
+                        action="store_true",
+                        help="Enable Prometheus metrics")
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="0.0.0.0",
+        help="URL to the Prometheus metrics server "
+        "(only needed if enable-metrics is set).",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8000,
+        help="Port number for the Prometheus metrics server "
+        "(only needed if enable-metrics is set).",
+    )
+
     return parser.parse_args()
 
 
+# explicitly use pure text format, with a newline at the end
+# this makes it impossible to see the animation in the progress bar
+# but will avoid messing up with ray or multiprocessing, which wraps
+# each line of output with some prefix.
+_BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]\n"  # noqa: E501
+
+
+class BatchProgressTracker:
+
+    def __init__(self):
+        self._total = 0
+        self._pbar: Optional[tqdm] = None
+
+    def submitted(self):
+        self._total += 1
+
+    def completed(self):
+        if self._pbar:
+            self._pbar.update()
+
+    def pbar(self) -> tqdm:
+        enable_tqdm = not torch.distributed.is_initialized(
+        ) or torch.distributed.get_rank() == 0
+        self._pbar = tqdm(total=self._total,
+                          unit="req",
+                          desc="Running batch",
+                          mininterval=5,
+                          disable=not enable_tqdm,
+                          bar_format=_BAR_FORMAT)
+        return self._pbar
+
+
 async def read_file(path_or_url: str) -> str:
     if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
         async with aiohttp.ClientSession() as session, \
@@ -79,8 +137,28 @@ async def write_file(path_or_url: str, data: str) -> None:
             f.write(data)
 
 
+def make_error_request_output(request: BatchRequestInput,
+                              error_msg: str) -> BatchRequestOutput:
+    batch_output = BatchRequestOutput(
+        id=f"aphrodite-{random_uuid()}",
+        custom_id=request.custom_id,
+        response=BatchResponseData(
+            status_code=HTTPStatus.BAD_REQUEST,
+            request_id=f"aphrodite-batch-{random_uuid()}",
+        ),
+        error=error_msg,
+    )
+    return batch_output
+
+
+async def make_async_error_request_output(
+        request: BatchRequestInput, error_msg: str) -> BatchRequestOutput:
+    return make_error_request_output(request, error_msg)
+
+
 async def run_request(serving_engine_func: Callable,
-                      request: BatchRequestInput) -> BatchRequestOutput:
+                      request: BatchRequestInput,
+                      tracker: BatchProgressTracker) -> BatchRequestOutput:
     response = await serving_engine_func(request.body)
 
     if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)):
@@ -101,8 +179,10 @@ async def run_request(serving_engine_func: Callable,
             error=response,
         )
     else:
-        raise ValueError("Request must not be sent in stream mode")
+        batch_output = make_error_request_output(
+            request, error_msg="Request must not be sent in stream mode")
 
+    tracker.completed()
     return batch_output
 
 
@@ -115,19 +195,22 @@ async def main(args):
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncAphrodite.from_engine_args(engine_args)
 
-    # When using single Aphrodite without engine_use_ray
     model_config = await engine.get_model_config()
+    base_model_paths = [
+        BaseModelPath(name=name, model_path=args.model)
+        for name in served_model_names
+    ]
 
     if args.disable_log_requests:
         request_logger = None
     else:
         request_logger = RequestLogger(max_log_len=args.max_log_len)
 
-    # Create the OpenAI serving objects.
+    # Create the openai serving objects.
     openai_serving_chat = OpenAIServingChat(
         engine,
         model_config,
-        served_model_names,
+        base_model_paths,
         args.response_role,
         lora_modules=None,
         prompt_adapters=None,
@@ -137,10 +220,13 @@ async def main(args):
     openai_serving_embedding = OpenAIServingEmbedding(
         engine,
         model_config,
-        served_model_names,
+        base_model_paths,
         request_logger=request_logger,
     )
 
+    tracker = BatchProgressTracker()
+    logger.info(f"Reading batch from {args.input_file}...")
+
     # Submit all requests in the file to the engine "concurrently".
     response_futures: List[Awaitable[BatchRequestOutput]] = []
     for request_json in (await read_file(args.input_file)).strip().split("\n"):
@@ -148,22 +234,30 @@ async def main(args):
         request_json = request_json.strip()
         if not request_json:
             continue
+
         request = BatchRequestInput.model_validate_json(request_json)
 
         # Determine the type of request and run it.
         if request.url == "/v1/chat/completions":
             response_futures.append(
                 run_request(openai_serving_chat.create_chat_completion,
-                            request))
+                            request, tracker))
+            tracker.submitted()
         elif request.url == "/v1/embeddings":
             response_futures.append(
-                run_request(openai_serving_embedding.create_embedding,
-                            request))
+                run_request(openai_serving_embedding.create_embedding, request,
+                            tracker))
+            tracker.submitted()
         else:
-            raise ValueError("Only /v1/chat/completions and /v1/embeddings are"
-                             "supported in the batch endpoint.")
+            response_futures.append(
+                make_async_error_request_output(
+                    request,
+                    error_msg="Only /v1/chat/completions and "
+                    "/v1/embeddings are supported in the batch endpoint.",
+                ))
 
-    responses = await asyncio.gather(*response_futures)
+    with tracker.pbar():
+        responses = await asyncio.gather(*response_futures)
 
     output_buffer = StringIO()
     for response in responses:
@@ -176,7 +270,15 @@ async def main(args):
 if __name__ == "__main__":
     args = parse_args()
 
-    logger.info(f"Aphrodite API server version {APHRODITE_VERSION}")
-    logger.info(f"args: {args}")
+    logger.info(f"Aphrodite batch processing API version {APHRODITE_VERSION}")
+    logger.debug(f"args: {args}")
+
+    # Start the Prometheus metrics server. LLMEngine uses the Prometheus client
+    # to publish metrics at the /metrics endpoint.
+    if args.enable_metrics:
+        logger.info("Prometheus metrics enabled")
+        start_http_server(port=args.port, addr=args.url)
+    else:
+        logger.info("Prometheus metrics disabled")
 
     asyncio.run(main(args))
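
For context, each line of the batch input file is one BatchRequestInput object, and the runner now returns an error output for unsupported URLs instead of raising. A minimal sketch of writing such a file: the values are hypothetical, custom_id, url, and body are the fields referenced in this diff, and method is assumed from the OpenAI batch file format rather than shown here:

import json

request_line = json.dumps({
    "custom_id": "request-1",          # echoed back in the BatchRequestOutput
    "method": "POST",                  # assumption; not referenced in this diff
    "url": "/v1/chat/completions",     # /v1/embeddings is also supported
    "body": {
        "model": "my-served-model",    # hypothetical served model name
        "messages": [{"role": "user", "content": "Hello!"}],
    },
})

with open("batch_input.jsonl", "w") as f:
    f.write(request_line + "\n")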

+ 6 - 5
aphrodite/endpoints/openai/serving_chat.py

@@ -26,7 +26,8 @@ from aphrodite.endpoints.openai.protocol import (
     ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
     ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
     DeltaToolCall, ErrorResponse, FunctionCall, ToolCall, UsageInfo)
-from aphrodite.endpoints.openai.serving_engine import (LoRAModulePath,
+from aphrodite.endpoints.openai.serving_engine import (BaseModelPath,
+                                                       LoRAModulePath,
                                                        OpenAIServing,
                                                        PromptAdapterPath,
                                                        TextTokensPrompt)
@@ -44,7 +45,7 @@ class OpenAIServingChat(OpenAIServing):
     def __init__(self,
                  engine_client: EngineClient,
                  model_config: ModelConfig,
-                 served_model_names: List[str],
+                 base_model_paths: List[BaseModelPath],
                  response_role: str,
                  *,
                  lora_modules: Optional[List[LoRAModulePath]],
@@ -56,7 +57,7 @@ class OpenAIServingChat(OpenAIServing):
                  tool_parser: Optional[str] = None):
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         served_model_names=served_model_names,
+                         base_model_paths=base_model_paths,
                          lora_modules=lora_modules,
                          prompt_adapters=prompt_adapters,
                          request_logger=request_logger,
@@ -249,7 +250,7 @@ class OpenAIServingChat(OpenAIServing):
         conversation: List[ConversationMessage],
         tokenizer: AnyTokenizer,
     ) -> AsyncGenerator[str, None]:
-        model_name = self.served_model_names[0]
+        model_name = self.base_model_paths[0].name
         created_time = int(time.time())
         chunk_object_type: Final = "chat.completion.chunk"
         first_iteration = True
@@ -583,7 +584,7 @@ class OpenAIServingChat(OpenAIServing):
         tokenizer: AnyTokenizer,
     ) -> Union[ErrorResponse, ChatCompletionResponse]:
 
-        model_name = self.served_model_names[0]
+        model_name = self.base_model_paths[0].name
         created_time = int(time.time())
         final_res: Optional[RequestOutput] = None
 

+ 5 - 4
aphrodite/endpoints/openai/serving_completions.py

@@ -16,7 +16,8 @@ from aphrodite.endpoints.openai.protocol import (
     CompletionLogProbs, CompletionRequest, CompletionResponse,
     CompletionResponseChoice, CompletionResponseStreamChoice,
     CompletionStreamResponse, ErrorResponse, UsageInfo)
-from aphrodite.endpoints.openai.serving_engine import (LoRAModulePath,
+from aphrodite.endpoints.openai.serving_engine import (BaseModelPath,
+                                                       LoRAModulePath,
                                                        OpenAIServing,
                                                        PromptAdapterPath)
 from aphrodite.engine.protocol import EngineClient
@@ -34,7 +35,7 @@ class OpenAIServingCompletion(OpenAIServing):
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        served_model_names: List[str],
+        base_model_paths: List[BaseModelPath],
         *,
         lora_modules: Optional[List[LoRAModulePath]],
         prompt_adapters: Optional[List[PromptAdapterPath]],
@@ -43,7 +44,7 @@ class OpenAIServingCompletion(OpenAIServing):
     ):
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         served_model_names=served_model_names,
+                         base_model_paths=base_model_paths,
                          lora_modules=lora_modules,
                          prompt_adapters=prompt_adapters,
                          request_logger=request_logger,
@@ -78,7 +79,7 @@ class OpenAIServingCompletion(OpenAIServing):
             return self.create_error_response(
                 "suffix is not currently supported")
 
-        model_name = self.served_model_names[0]
+        model_name = self.base_model_paths[0].name
         request_id = f"cmpl-{random_uuid()}"
         created_time = int(time.time())
 

+ 4 - 3
aphrodite/endpoints/openai/serving_embedding.py

@@ -16,7 +16,8 @@ from aphrodite.endpoints.openai.protocol import (EmbeddingRequest,
                                                  EmbeddingResponse,
                                                  EmbeddingResponseData,
                                                  ErrorResponse, UsageInfo)
-from aphrodite.endpoints.openai.serving_engine import OpenAIServing
+from aphrodite.endpoints.openai.serving_engine import (BaseModelPath,
+                                                       OpenAIServing)
 from aphrodite.engine.protocol import EngineClient
 
 TypeTokenIDs = List[int]
@@ -61,13 +62,13 @@ class OpenAIServingEmbedding(OpenAIServing):
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        served_model_names: List[str],
+        base_model_paths: List[BaseModelPath],
         *,
         request_logger: Optional[RequestLogger],
     ):
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         served_model_names=served_model_names,
+                         base_model_paths=base_model_paths,
                          lora_modules=None,
                          prompt_adapters=None,
                          request_logger=request_logger)

+ 29 - 14
aphrodite/endpoints/openai/serving_engine.py

@@ -35,6 +35,12 @@ from aphrodite.modeling.guided_decoding import (
 from aphrodite.prompt_adapter.request import PromptAdapterRequest
 
 
+@dataclass
+class BaseModelPath:
+    name: str
+    model_path: str
+
+
 @dataclass
 class PromptAdapterPath:
     name: str
@@ -45,6 +51,7 @@ class PromptAdapterPath:
 class LoRAModulePath:
     name: str
     path: str
+    base_model_name: Optional[str] = None
 
 
 AnyRequest = Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest,
@@ -64,7 +71,7 @@ class OpenAIServing:
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        served_model_names: List[str],
+        base_model_paths: List[BaseModelPath],
         *,
         lora_modules: Optional[List[LoRAModulePath]],
         prompt_adapters: Optional[List[PromptAdapterPath]],
@@ -77,16 +84,19 @@ class OpenAIServing:
         self.model_config = model_config
         self.max_model_len = model_config.max_model_len
 
-        self.served_model_names = served_model_names
+        self.base_model_paths = base_model_paths
 
         self.lora_requests = []
         if lora_modules is not None:
             self.lora_requests = [
-                LoRARequest(
-                    lora_name=lora.name,
-                    lora_int_id=i,
-                    lora_path=lora.path,
-                ) for i, lora in enumerate(lora_modules, start=1)
+                LoRARequest(lora_name=lora.name,
+                            lora_int_id=i,
+                            lora_path=lora.path,
+                            base_model_name=lora.base_model_name
+                            if lora.base_model_name
+                            and self._is_model_supported(lora.base_model_name)
+                            else self.base_model_paths[0].name)
+                for i, lora in enumerate(lora_modules, start=1)
             ]
 
         self.prompt_adapter_requests = []
@@ -109,21 +119,23 @@ class OpenAIServing:
     async def show_available_models(self) -> ModelList:
         """Show available models. Right now we only have one model."""
         model_cards = [
-            ModelCard(id=served_model_name,
+            ModelCard(id=base_model.name,
                       max_model_len=self.max_model_len,
-                      root=self.served_model_names[0],
+                      root=base_model.model_path,
                       permission=[ModelPermission()])
-            for served_model_name in self.served_model_names
+            for base_model in self.base_model_paths
         ]
         lora_cards = [
             ModelCard(id=lora.lora_name,
-                      root=self.served_model_names[0],
+                      root=lora.local_path,
+                      parent=lora.base_model_name if lora.base_model_name else
+                      self.base_model_paths[0].name,
                       permission=[ModelPermission()])
             for lora in self.lora_requests
         ]
         prompt_adapter_cards = [
             ModelCard(id=prompt_adapter.prompt_adapter_name,
-                      root=self.served_model_names[0],
+                      root=self.base_model_paths[0].name,
                       permission=[ModelPermission()])
             for prompt_adapter in self.prompt_adapter_requests
         ]
@@ -168,7 +180,7 @@ class OpenAIServing:
     ) -> Optional[ErrorResponse]:
         # only check these if it's not a Tokenizer/Detokenize Request
         if not isinstance(request, (TokenizeRequest, DetokenizeRequest)):
-            if request.model in self.served_model_names:
+            if self._is_model_supported(request.model):
                 return None
             if request.model in [
                     lora.lora_name for lora in self.lora_requests
@@ -188,7 +200,7 @@ class OpenAIServing:
         self, request: AnyRequest
     ) -> Union[Tuple[None, None], Tuple[LoRARequest, None], Tuple[
             None, PromptAdapterRequest]]:
-        if request.model in self.served_model_names:
+        if self._is_model_supported(request.model):
             return None, None
         for lora in self.lora_requests:
             if request.model == lora.lora_name:
@@ -462,3 +474,6 @@ class OpenAIServing:
         if logprob.decoded_token is not None:
             return logprob.decoded_token
         return tokenizer.decode(token_id)
+
+    def _is_model_supported(self, model_name):
+        return any(model.name == model_name for model in self.base_model_paths)
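
The lineage added here is surfaced through /v1/models: base models report their model_path as root with no parent, while LoRA adapters report their own path as root and their base model name as parent. A minimal sketch of inspecting this with the official openai client, assuming a locally running server (the URL is hypothetical; the new tests/endpoints/openai/test_lora_lineage.py below asserts the same fields):

import openai

client = openai.OpenAI(base_url="http://localhost:2242/v1", api_key="EMPTY")

for model in client.models.list().data:
    # Base model: root == model path, parent is None.
    # LoRA adapter: root == adapter path, parent == base model name.
    print(model.id, model.root, model.parent)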

+ 4 - 3
aphrodite/endpoints/openai/serving_tokenization.py

@@ -18,7 +18,8 @@ from aphrodite.endpoints.openai.protocol import (DetokenizeRequest,
                                                  TokenizeRequest,
                                                  TokenizeResponse)
 # yapf: enable
-from aphrodite.endpoints.openai.serving_engine import (LoRAModulePath,
+from aphrodite.endpoints.openai.serving_engine import (BaseModelPath,
+                                                       LoRAModulePath,
                                                        OpenAIServing)
 from aphrodite.engine.protocol import EngineClient
 from aphrodite.transformers_utils.tokenizer import MistralTokenizer
@@ -30,7 +31,7 @@ class OpenAIServingTokenization(OpenAIServing):
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        served_model_names: List[str],
+        base_model_paths: List[BaseModelPath],
         *,
         lora_modules: Optional[List[LoRAModulePath]],
         request_logger: Optional[RequestLogger],
@@ -38,7 +39,7 @@ class OpenAIServingTokenization(OpenAIServing):
     ):
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         served_model_names=served_model_names,
+                         base_model_paths=base_model_paths,
                          lora_modules=lora_modules,
                          prompt_adapters=None,
                          request_logger=request_logger)

+ 1 - 1
aphrodite/executor/cpu_executor.py

@@ -208,7 +208,7 @@ class CPUExecutor(ExecutorBase):
         # NOTE: `cpu block` for CPU backend is located on CPU memory but is
         # referred as `gpu block`. Because we want to reuse the existing block
         # management procedure.
-        logger.info("# CPU blocks: %d", num_gpu_blocks)
+        logger.info(f"# CPU blocks: {num_gpu_blocks}")
 
         self._run_workers("initialize_cache",
                           num_gpu_blocks=num_gpu_blocks,

+ 2 - 2
aphrodite/executor/ray_tpu_executor.py

@@ -287,8 +287,8 @@ class RayTPUExecutor(TPUExecutor):
 
     def initialize_cache(self, num_gpu_blocks: int,
                          num_cpu_blocks: int) -> None:
-        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
-                    num_cpu_blocks)
+        logger.info(f"# TPU blocks: {num_gpu_blocks}, "
+                    f"# CPU blocks: {num_cpu_blocks}")
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
         self._run_workers("initialize_cache",

+ 2 - 2
aphrodite/executor/ray_utils.py

@@ -185,8 +185,8 @@ def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
         # Exponential backoff for warning print.
         wait_interval *= 2
         logger.info(
-            "Waiting for removing a placement group of specs for "
-            "%d seconds.", int(time.time() - s))
+            f"Waiting for removing a placement group of specs for "
+            f"{int(time.time() - s)} seconds.")
         time.sleep(wait_interval)
 
 

+ 2 - 2
aphrodite/lora/models.py

@@ -368,8 +368,8 @@ class LoRAModelManager(AdapterModelManager):
         index, _ = first_free_slot
         self._active_adapters[lora_id] = None
         lora_model = self._registered_adapters[lora_id]
-        logger.debug("Activating LoRA. int id: %d, slot index: %d",
-                     lora_model.id, index)
+        logger.debug(f"Activating LoRA. int id: {lora_model.id}, "
+                     f"slot index: {index}")
         self.lora_index_to_id[index] = lora_model.id
         for module_name, module in self.modules.items():
             module_lora = lora_model.get_lora(module_name)

+ 1 - 0
aphrodite/lora/request.py

@@ -28,6 +28,7 @@ class LoRARequest(
     lora_path: str = ""
     lora_local_path: Optional[str] = msgspec.field(default=None)
     long_lora_max_len: Optional[int] = None
+    base_model_name: Optional[str] = msgspec.field(default=None)
     __hash__ = AdapterRequest.__hash__
 
     def __post_init__(self):

+ 4 - 3
aphrodite/modeling/models/qwen.py

@@ -703,9 +703,10 @@ def input_processor_for_qwen(ctx: InputContext,
 
     if num_matched_images != num_images:
         logger.warning(
-            "Number of matched image placeholders %s doesn't match the number "
-            "of expected images %s; check your placeholder formatting.",
-            num_matched_images, num_images)
+            f"Number of matched image placeholders {num_matched_images} "
+            f"doesn't match the number of expected images {num_images}; "
+            "check your placeholder formatting."
+        )
 
     new_prompt_token_ids = tokenizer.encode(new_prompt)
 

+ 1 - 1
aphrodite/modeling/models/qwen2_vl.py

@@ -593,7 +593,7 @@ def mm_input_mapper_for_qwen2_vl(
             images=images, videos=videos, return_tensors="pt"
         ).data
     except Exception:
-        logger.error("Failed to process image (%s)", data)
+        logger.error(f"Failed to process image ({data})")
         raise
     return MultiModalInputs(batch_data)
 

+ 1 - 1
aphrodite/multimodal/video.py

@@ -55,7 +55,7 @@ class VideoPlugin(ImagePlugin):
             try:
                 batch_data = video_processor(data, return_tensors="pt").data
             except Exception:
-                logger.error("Failed to process image (%s)", data)
+                logger.error(f"Failed to process image ({data})")
                 raise
             return MultiModalInputs(batch_data)
         elif is_list_of(data, np.ndarray):

+ 4 - 4
aphrodite/worker/model_runner_base.py

@@ -113,8 +113,8 @@ def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
             except Exception as err:
                 timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
                 filename = f"/tmp/err_{func.__name__}_input_{timestamp}.pkl"
-                logger.info("Writing input of failed execution to %s...",
-                            filename)
+                logger.info("Writing input of failed execution to "
+                            f"{filename}...")
                 with open(filename, "wb") as filep:
                     dumped_inputs = {
                         k: v
@@ -135,8 +135,8 @@ def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
 
                     pickle.dump(dumped_inputs, filep)
                     logger.info(
-                        "Completed writing input of failed execution to %s.",
-                        filename)
+                        f"Completed writing input of failed execution to "
+                        f"{filename}.")
                 raise type(err)(
                     f"Error in model execution (input dumped to {filename}): "
                     f"{str(err)}") from err

+ 103 - 0
tests/endpoints/openai/test_cli_args.py

@@ -0,0 +1,103 @@
+import json
+import unittest
+
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.endpoints.openai.args import make_arg_parser
+from aphrodite.endpoints.openai.serving_engine import LoRAModulePath
+
+LORA_MODULE = {
+    "name": "module2",
+    "path": "/path/to/module2",
+    "base_model_name": "llama",
+}
+
+
+class TestLoraParserAction(unittest.TestCase):
+    def setUp(self):
+        # Setting up argparse parser for tests
+        parser = FlexibleArgumentParser(
+            description="Aphrodite's remote OpenAI server."
+        )
+        self.parser = make_arg_parser(parser)
+
+    def test_valid_key_value_format(self):
+        # Test old format: name=path
+        args = self.parser.parse_args(
+            [
+                "--lora-modules",
+                "module1=/path/to/module1",
+            ]
+        )
+        expected = [LoRAModulePath(name="module1", path="/path/to/module1")]
+        self.assertEqual(args.lora_modules, expected)
+
+    def test_valid_json_format(self):
+        # Test valid JSON format input
+        args = self.parser.parse_args(
+            [
+                "--lora-modules",
+                json.dumps(LORA_MODULE),
+            ]
+        )
+        expected = [
+            LoRAModulePath(
+                name="module2", path="/path/to/module2", base_model_name="llama"
+            )
+        ]
+        self.assertEqual(args.lora_modules, expected)
+
+    def test_invalid_json_format(self):
+        # Test invalid JSON format input, missing closing brace
+        with self.assertRaises(SystemExit):
+            self.parser.parse_args(
+                [
+                    "--lora-modules",
+                    '{"name": "module3", "path": "/path/to/module3"',
+                ]
+            )
+
+    def test_invalid_type_error(self):
+        # Test type error when values are not JSON or key=value
+        with self.assertRaises(SystemExit):
+            self.parser.parse_args(
+                [
+                    "--lora-modules",
+                    "invalid_format",  # This is not JSON or key=value format
+                ]
+            )
+
+    def test_invalid_json_field(self):
+        # Test valid JSON format but missing required fields
+        with self.assertRaises(SystemExit):
+            self.parser.parse_args(
+                [
+                    "--lora-modules",
+                    '{"name": "module4"}',  # Missing required 'path' field
+                ]
+            )
+
+    def test_empty_values(self):
+        # Test when no LoRA modules are provided
+        args = self.parser.parse_args(["--lora-modules", ""])
+        self.assertEqual(args.lora_modules, [])
+
+    def test_multiple_valid_inputs(self):
+        # Test multiple valid inputs (both old and JSON format)
+        args = self.parser.parse_args(
+            [
+                "--lora-modules",
+                "module1=/path/to/module1",
+                json.dumps(LORA_MODULE),
+            ]
+        )
+        expected = [
+            LoRAModulePath(name="module1", path="/path/to/module1"),
+            LoRAModulePath(
+                name="module2", path="/path/to/module2", base_model_name="llama"
+            ),
+        ]
+        self.assertEqual(args.lora_modules, expected)
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 81 - 0
tests/endpoints/openai/test_lora_lineage.py

@@ -0,0 +1,81 @@
+import json
+
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+
+from ...utils import RemoteOpenAIServer
+
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# generation quality here
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+
+
+@pytest.fixture(scope="module")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+
+
+@pytest.fixture(scope="module")
+def server_with_lora_modules_json(zephyr_lora_files):
+    # Define the json format LoRA module configurations
+    lora_module_1 = {
+        "name": "zephyr-lora",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME,
+    }
+    lora_module_2 = {
+        "name": "zephyr-lora2",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME,
+    }
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--lora-modules",
+        json.dumps(lora_module_1),
+        json.dumps(lora_module_2),
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "64",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client_for_lora_lineage(server_with_lora_modules_json):
+    async with server_with_lora_modules_json.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_check_lora_lineage(
+    client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files
+):
+    models = await client_for_lora_lineage.models.list()
+    models = models.data
+    served_model = models[0]
+    lora_models = models[1:]
+    assert served_model.id == MODEL_NAME
+    assert served_model.root == MODEL_NAME
+    assert served_model.parent is None
+    assert all(
+        lora_model.root == zephyr_lora_files for lora_model in lora_models
+    )
+    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
+    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[1].id == "zephyr-lora2"

+ 4 - 2
tests/endpoints/openai/test_models.py

@@ -49,12 +49,14 @@ def client(server):
 
 
 @pytest.mark.asyncio
-async def test_check_models(client: openai.AsyncOpenAI):
+async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
     lora_models = models[1:]
     assert served_model.id == MODEL_NAME
-    assert all(model.root == MODEL_NAME for model in models)
+    assert served_model.root == MODEL_NAME
+    assert all(lora_model.root == zephyr_lora_files
+               for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
     assert lora_models[1].id == "zephyr-lora2"

+ 4 - 2
tests/endpoints/openai/test_serving_chat.py

@@ -6,11 +6,13 @@ from unittest.mock import MagicMock
 from aphrodite.common.config import MultiModalConfig
 from aphrodite.endpoints.openai.protocol import ChatCompletionRequest
 from aphrodite.endpoints.openai.serving_chat import OpenAIServingChat
+from aphrodite.endpoints.openai.serving_engine import BaseModelPath
 from aphrodite.engine.async_aphrodite import AsyncAphrodite
 from aphrodite.transformers_utils.tokenizer import get_tokenizer
 
 MODEL_NAME = "openai-community/gpt2"
 CHAT_TEMPLATE = "Dummy chat template for testing {}"
+BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
 
 
 @dataclass
@@ -37,7 +39,7 @@ async def _async_serving_chat_init():
 
     serving_completion = OpenAIServingChat(engine,
                                            model_config,
-                                           served_model_names=[MODEL_NAME],
+                                           BASE_MODEL_PATHS,
                                            response_role="assistant",
                                            chat_template=CHAT_TEMPLATE,
                                            lora_modules=None,
@@ -57,7 +59,7 @@ def test_serving_chat_should_set_correct_max_tokens():
 
     serving_chat = OpenAIServingChat(mock_engine,
                                      MockModelConfig(),
-                                     served_model_names=[MODEL_NAME],
+                                     BASE_MODEL_PATHS,
                                      response_role="assistant",
                                      chat_template=CHAT_TEMPLATE,
                                      lora_modules=None,

+ 6 - 4
tests/prefix_caching/test_disable_sliding_window.py

@@ -27,8 +27,9 @@ def test_disable_sliding_window(model_len_len, ):
     aphrodite_disabled_model.generate("Hi my name is")
     model_config = aphrodite_disabled_model.llm_engine.model_config
     assert model_config.max_model_len == sliding_len, (
-        "Max len expected to equal sliding_len of %s, but got %s", sliding_len,
-        model_config.max_model_len)
+        f"Max len expected to equal sliding_len of {sliding_len}, "
+        f"but got {model_config.max_model_len}"
+    )
 
     del aphrodite_disabled_model
     cleanup()
@@ -37,8 +38,9 @@ def test_disable_sliding_window(model_len_len, ):
     aphrodite_enabled_model.generate("Hi my name is")
     model_config = aphrodite_enabled_model.llm_engine.model_config
     assert model_config.max_model_len == full_len, (
-        "Max len expected to equal full_len of %s, but got %s", full_len,
-        model_config.max_model_len)
+        f"Max len expected to equal full_len of {full_len}, "
+        f"but got {model_config.max_model_len}"
+    )
 
     del aphrodite_enabled_model
     cleanup()