@@ -13,33 +13,46 @@ from aphrodite.common.utils import Counter
 class LLM:
     """An LLM for generating texts from given prompts and sampling parameters.

-    This class includes a tokenizer, a language model (possible distributed
+    This class includes a tokenizer, a language model (possibly distributed
     across multiple GPUs), and GPU memory space allocated for intermediate
     states (aka KV cache). Given a batch of prompts and sampling parameters,
     this class generates texts from the model, using an intelligent batching
     mechanism and efficient memory management.

     NOTE: This class is intended to be used for offline inference. For online
-    serving, use the `AsyncAphrodite` class instead.
+    serving, use the `AsyncLLMEngine` class instead.
     NOTE: For the comprehensive list of arguments, see `EngineArgs`.

     Args:
-        model: The name or path of a compatible HuggingFace Transformer model.
-        tokenizer: The name or path of a HF Transformers tokenizer.
+        model: The name or path of a HuggingFace Transformers model.
+        tokenizer: The name or path of a HuggingFace Transformers tokenizer.
         tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
             if available, and "slow" will always use the slow tokenizer.
         trust_remote_code: Trust remote code (e.g., from HuggingFace) when
             downloading the model and tokenizer.
-        tensor_parallel_size: The number of GPUs to use for distribtuted
+        tensor_parallel_size: The number of GPUs to use for distributed
             execution with tensor parallelism.
-        dtype: The datatype for the model weights and activations. Currently
-            Aphrodite supports `float32`, `float16`, and `bfloat16`. If `auto`
-            is used, it'll use the `torch_dtype` attribute specified in the model
-            config file. However, if the `torch_dtype` in the config is `float32`,
-            we will use `bfloat16` if your GPU supports it, otherwise `float16`.
-        seed: The seed to initialize the RNG for sampling.
+        dtype: The data type for the model weights and activations. Currently,
+            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
+            the `torch_dtype` attribute specified in the model config file.
+            However, if the `torch_dtype` in the config is `float32`, we will
+            use `float16` instead.
+        quantization: The method used to quantize the model weights. Currently,
+            we support "awq". If None, we assume the model weights are not
+            quantized and use `dtype` to determine the data type of the weights.
         revision: The specific model version to use. It can be a branch name,
-            a tag name, or a commit ID.
+            a tag name, or a commit id.
+        seed: The seed to initialize the random number generator for sampling.
+        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
+            reserve for the model weights, activations, and KV cache. Higher
+            values will increase the KV cache size and thus improve the model's
+            throughput. However, if the value is too high, it may cause out-of-
+            memory (OOM) errors.
+        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
+            This can be used for temporarily storing the states of the requests
+            when their `best_of` sampling parameters are larger than 1. If all
+            requests have `best_of=1`, you can safely set this to 0.
+            Otherwise, too small values may cause out-of-memory (OOM) errors.
     """

     def __init__(
@@ -50,7 +63,11 @@ class LLM:
         trust_remote_code: bool = False,
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
+        quantization: Optional[str] = None,
+        revision: Optional[str] = None,
         seed: int = 0,
+        gpu_memory_utilization: float = 0.9,
+        swap_space: int = 4,
         **kwargs,
     ) -> None:
         if "disable_log_stats" not in kwargs:
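For context, a minimal usage sketch of the widened constructor. Two things here are assumptions, not shown in this patch: that the package exposes `LLM` at the top level, and the model name, which is a hypothetical AWQ-quantized checkpoint.

    from aphrodite import LLM

    # Hypothetical checkpoint name; substitute any AWQ-quantized model path.
    llm = LLM(
        model="TheBloke/Llama-2-7B-AWQ",
        quantization="awq",           # weights are pre-quantized with AWQ
        dtype="auto",                 # fall back to the config's torch_dtype
        revision="main",              # branch name, tag name, or commit id
        seed=0,
        gpu_memory_utilization=0.9,   # fraction of GPU memory for weights + KV cache
        swap_space=4,                 # GiB of CPU swap per GPU, used when best_of > 1
    )

Per the docstring above, a higher `gpu_memory_utilization` buys a larger KV cache (and thus throughput) at the cost of OOM risk.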
@@ -62,21 +79,25 @@
             trust_remote_code=trust_remote_code,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
+            quantization=quantization,
+            revision=revision,
             seed=seed,
+            gpu_memory_utilization=gpu_memory_utilization,
+            swap_space=swap_space,
             **kwargs,
         )
-        self.aphrodite_engine = AphroditeEngine.from_engine_args(engine_args)
+        self.llm_engine = AphroditeEngine.from_engine_args(engine_args)
         self.request_counter = Counter()
-
+
     def get_tokenizer(
             self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-        return self.aphrodite_engine.tokenizer
-
+        return self.llm_engine.tokenizer
+
     def set_tokenizer(
         self,
         tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     ) -> None:
-        self.aphrodite_engine.tokenizer = tokenizer
+        self.llm_engine.tokenizer = tokenizer

     def generate(
         self,
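The tokenizer accessors above simply delegate to the renamed `llm_engine` attribute. A sketch of swapping in a replacement tokenizer, assuming an `llm` instance as constructed earlier (the checkpoint name is illustrative):

    from transformers import AutoTokenizer

    # Illustrative checkpoint; any compatible fast or slow tokenizer works.
    tok = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    llm.set_tokenizer(tok)
    assert llm.get_tokenizer() is tok  # subsequent requests tokenize with `tok`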
@@ -87,34 +108,37 @@
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.

-        NOTE: This class automatically batches the given prompts, considering the
-        memory constraint. For the best performance, put all of your prompts into
-        a single list and pass it to this method.
+        NOTE: This method automatically batches the given prompts, considering
+        the memory constraint. For the best performance, put all of your prompts
+        into a single list and pass it to this method.

         Args:
             prompts: A list of prompts to generate completions for.
-            sampling_params: The sampling parameters for text generation. If None,
-                we use the default sampling parameters.
+            sampling_params: The sampling parameters for text generation. If
+                None, we use the default sampling parameters.
             prompt_token_ids: A list of token IDs for the prompts. If None, we
                 use the tokenizer to convert the prompts to token IDs.
             use_tqdm: Whether to use tqdm to display the progress bar.

         Returns:
-            A list of `RequestOutput` objects containing the generated completions
-            in the same order as the input prompts.
+            A list of `RequestOutput` objects containing the generated
+            completions in the same order as the input prompts.
         """
         if prompts is None and prompt_token_ids is None:
             raise ValueError("Either prompts or prompt_token_ids must be "
                              "provided.")
         if isinstance(prompts, str):
+            # Convert a single prompt to a list.
             prompts = [prompts]
         if prompts is not None and prompt_token_ids is not None:
             if len(prompts) != len(prompt_token_ids):
                 raise ValueError("The lengths of prompts and prompt_token_ids "
                                  "must be the same.")
         if sampling_params is None:
+            # Use default sampling params.
             sampling_params = SamplingParams()

+        # Add requests to the engine.
         if prompts is not None:
            num_requests = len(prompts)
         else:
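Because `generate` batches whatever it is given, the NOTE above amounts to: make one call with the full prompt list rather than looping in Python. A sketch, assuming `SamplingParams` is exported from the package root as in the earlier example, with arbitrary sampling values:

    from aphrodite import SamplingParams

    prompts = [
        "The capital of France is",
        "The key idea behind KV caching is",
    ]
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

    # One call over the whole list lets the engine batch across prompts.
    outputs = llm.generate(prompts, params)
    for out in outputs:
        print(out.prompt, "->", out.outputs[0].text)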
@@ -135,25 +159,27 @@
         prompt_token_ids: Optional[List[int]],
     ) -> None:
         request_id = str(next(self.request_counter))
-        self.aphrodite_engine.add_request(request_id, prompt, sampling_params, prompt_token_ids)
+        self.llm_engine.add_request(request_id, prompt, sampling_params,
+                                    prompt_token_ids)

     def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
+        # Initialize tqdm.
         if use_tqdm:
-            num_requests = self.aphrodite_engine.get_num_unfinished_requests()
+            num_requests = self.llm_engine.get_num_unfinished_requests()
             pbar = tqdm(total=num_requests, desc="Processed prompts")
+        # Run the engine.
         outputs: List[RequestOutput] = []
-        while self.aphrodite_engine.has_unfinished_requests():
-            step_outputs = self.aphrodite_engine.step()
+        while self.llm_engine.has_unfinished_requests():
+            step_outputs = self.llm_engine.step()
             for output in step_outputs:
                 if output.finished:
                     outputs.append(output)
                     if use_tqdm:
                         pbar.update(1)
-
         if use_tqdm:
             pbar.close()
-        # Sort the outputs by request ID. Necessary because some outputs
-        # may be finished earlier than previous requests.
+        # Sort the outputs by request ID.
+        # This is necessary because some requests may be finished earlier
+        # than requests that arrived before them.
         outputs = sorted(outputs, key=lambda x: int(x.request_id))
         return outputs
-
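Request IDs are issued by the monotonically increasing `Counter`, so the final sort restores submission order even when short requests finish before earlier long ones. A toy check of that invariant, reusing `prompts` and `params` from the sketch above:

    # The i-th output corresponds to the i-th prompt, regardless of the
    # order in which requests finished inside the engine loop.
    outputs = llm.generate(prompts, params)
    for prompt, out in zip(prompts, outputs):
        assert out.prompt == prompt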
|