@@ -207,7 +207,7 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
         while True:
             self._dummy_run(batch_size, seq_len, kv_caches, is_prompt=True)
             xm.wait_device_ops()
-            logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len)
+            logger.info(f"batch_size: {batch_size}, seq_len: {seq_len}")
 
             if seq_len >= self.model_config.max_model_len:
                 break
@@ -217,7 +217,7 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
             seq_len = seq_len * 2
 
         end = time.time()
-        logger.info("Compilation for prefill done in %.2f s.", end - start)
+        logger.info(f"Compilation for prefill done in {end - start:.2f} s.")
 
         # Decode
         start = time.time()
@@ -226,14 +226,14 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
         while True:
             self._dummy_run(batch_size, seq_len, kv_caches, is_prompt=False)
             xm.wait_device_ops()
-            logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len)
+            logger.info(f"batch_size: {batch_size}, seq_len: {seq_len}")
 
             if batch_size >= self.scheduler_config.max_num_seqs:
                 break
             batch_size = batch_size + 16 if batch_size >= 16 else batch_size * 2
 
         end = time.time()
-        logger.info("Compilation for decode done in %.2f s.", end - start)
+        logger.info(f"Compilation for decode done in {end - start:.2f} s.")
 
     def _prepare_prompt(
         self,