123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216 |
- from typing import List, Optional, Union
- from tqdm import tqdm
- from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
- from aphrodite.lora.request import LoRARequest
- from aphrodite.engine.args_tools import EngineArgs
- from aphrodite.engine.aphrodite_engine import AphroditeEngine
- from aphrodite.common.outputs import RequestOutput
- from aphrodite.common.sampling_params import SamplingParams
- from aphrodite.common.utils import Counter
class LLM:
    """An LLM for generating texts from given prompts and sampling parameters.

    This class includes a tokenizer, a language model (possibly distributed
    across multiple GPUs), and GPU memory space allocated for intermediate
    states (aka KV cache). Given a batch of prompts and sampling parameters,
    this class generates texts from the model, using an intelligent batching
    mechanism and efficient memory management.

    NOTE: This class is intended to be used for offline inference. For online
    serving, use the `AsyncLLMEngine` class instead.

    NOTE: For the comprehensive list of arguments, see `EngineArgs`.

    Args:
        model: The name or path of a HuggingFace Transformers model.
        tokenizer: The name or path of a HuggingFace Transformers tokenizer.
        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
            if available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
            the `torch_dtype` attribute specified in the model config file.
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq", "gptq", "quip" and "squeezellm". If None,
            we first check the `quantization_config` attribute in the model
            config file. If that is None, we assume the model weights are not
            quantized and use `dtype` to determine the data type of the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
        seed: The seed to initialize the random number generator for sampling.
        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
            reserve for the model weights, activations, and KV cache. Higher
            values will increase the KV cache size and thus improve the model's
            throughput. However, if the value is too high, it may cause out-of-
            memory (OOM) errors.
        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
            This can be used for temporarily storing the states of the requests
            when their `best_of` sampling parameters are larger than 1. If all
            requests will have `best_of=1`, you can safely set this to 0.
            Otherwise, too small values may cause out-of-memory (OOM) errors.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode.
        disable_custom_all_reduce: See ParallelConfig.
    """

    def __init__(
        self,
        model: str,
        tokenizer: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        revision: Optional[str] = None,
        seed: int = 0,
        gpu_memory_utilization: float = 0.9,
        swap_space: int = 4,
        enforce_eager: bool = False,
        max_context_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        **kwargs,
    ) -> None:
        """Build the engine from the given arguments.

        Any extra keyword arguments are forwarded to `EngineArgs` unchanged.
        """
        # Stat logging is switched off unless the caller asks for it.
        kwargs.setdefault("disable_log_stats", True)
        engine_args = EngineArgs(
            model=model,
            tokenizer=tokenizer,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            max_context_len_to_capture=max_context_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            **kwargs,
        )
        self.llm_engine = AphroditeEngine.from_engine_args(engine_args)
        # Source of the request IDs handed to the engine by `_add_request`.
        self.request_counter = Counter()

    def get_tokenizer(
            self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        """Return the tokenizer currently held by the engine."""
        return self.llm_engine.tokenizer

    def set_tokenizer(
        self,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    ) -> None:
        """Replace the tokenizer held by the engine."""
        self.llm_engine.tokenizer = tokenizer

    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        sampling_params: Optional[SamplingParams] = None,
        prompt_token_ids: Optional[List[List[int]]] = None,
        prefix_pos: Optional[Union[int, List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
    ) -> List[RequestOutput]:
        """Generates the completions for the input prompts.

        NOTE: This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your
        prompts into a single list and pass it to this method.

        Args:
            prompts: A single prompt or a list of prompts to generate
                completions for.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
            prompt_token_ids: A list of token IDs for the prompts. If None, we
                use the tokenizer to convert the prompts to token IDs.
            prefix_pos: If not None, we use the given position as the prefix
                position for each prompt. A single int is applied to every
                prompt; a list supplies one position per prompt. We will
                cache the prefix's KV cache and reuse it for the next request
                with the same prefix. This is an experimental feature, and
                may be replaced with automatic prefix caching in the future.
            use_tqdm: Whether to use tqdm to display the progress bar.
            lora_request: LoRA request to use for generation, if any.

        Returns:
            A list of `RequestOutput` objects containing the generated
            completions in the same order as the input prompts.

        Raises:
            ValueError: If neither `prompts` nor `prompt_token_ids` is given,
                if both are given with different lengths, or if `prefix_pos`
                is a list with fewer entries than there are prompts.
        """
        if prompts is None and prompt_token_ids is None:
            raise ValueError("Either prompts or prompt_token_ids must be "
                             "provided.")
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
        if (prompts is not None and prompt_token_ids is not None
                and len(prompts) != len(prompt_token_ids)):
            raise ValueError("The lengths of prompts and prompt_token_ids "
                             "must be the same.")

        num_requests = (len(prompts)
                        if prompts is not None else len(prompt_token_ids))

        # Normalise `prefix_pos` to one entry per request. The signature
        # allows a bare int, so broadcast it instead of indexing into it.
        if prefix_pos is None or isinstance(prefix_pos, int):
            prefix_positions = [prefix_pos] * num_requests
        else:
            # Validate before queuing anything so a bad argument cannot leave
            # half of the batch stranded inside the engine.
            if len(prefix_pos) < num_requests:
                raise ValueError("prefix_pos must provide a position for "
                                 "every prompt.")
            prefix_positions = prefix_pos

        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()

        # Add requests to the engine.
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            token_ids = (prompt_token_ids[i]
                         if prompt_token_ids is not None else None)
            self._add_request(prompt,
                              sampling_params,
                              token_ids,
                              lora_request=lora_request,
                              prefix_pos=prefix_positions[i])
        return self._run_engine(use_tqdm)

    def _add_request(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]],
        prefix_pos: Optional[int] = None,
        lora_request: Optional[LoRARequest] = None,
    ) -> None:
        """Queue one request in the engine under the next counter value."""
        # The ID is the stringified counter value; `_run_engine` converts it
        # back to an int to restore submission order.
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(request_id,
                                    prompt,
                                    sampling_params,
                                    prompt_token_ids,
                                    lora_request=lora_request,
                                    prefix_pos=prefix_pos)

    def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
        """Step the engine until no request is left unfinished.

        Args:
            use_tqdm: Whether to display a tqdm progress bar.

        Returns:
            The finished outputs, sorted by integer request ID.
        """
        # Initialize tqdm.
        pbar = None
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(total=num_requests, desc="Processed prompts")

        # Run the engine.
        outputs: List[RequestOutput] = []
        try:
            while self.llm_engine.has_unfinished_requests():
                for output in self.llm_engine.step():
                    if output.finished:
                        outputs.append(output)
                        if pbar is not None:
                            pbar.update(1)
        finally:
            # Close the bar even when a step raises.
            if pbar is not None:
                pbar.close()

        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier than
        # its previous requests.
        outputs.sort(key=lambda x: int(x.request_id))
        return outputs
|