12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371 |
- import enum
- import json
- import os
- from dataclasses import dataclass, field, fields
- from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Union
- import torch
- from loguru import logger
- from transformers import PretrainedConfig
- from aphrodite.common.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
- from aphrodite.modeling.models import ModelRegistry
- from aphrodite.quantization import QUANTIZATION_METHODS
- from aphrodite.transformers_utils.config import get_config, get_hf_text_config
- if TYPE_CHECKING:
- from ray.util.placement_group import PlacementGroup
- from aphrodite.modeling.model_loader.loader import BaseModelLoader
# Opt-in switch read once at import time: when the APHRODITE_USE_MODELSCOPE
# environment variable equals "true" (case-insensitive), models are loaded
# from ModelScope instead of the Hugging Face Hub.
APHRODITE_USE_MODELSCOPE = (os.environ.get("APHRODITE_USE_MODELSCOPE",
                                           "False").lower() == "true")

# Number of bytes in one GiB.
_GB = 2**30

# Lower bound applied to the default max_num_batched_tokens when the
# scheduler is configured in embedding mode.
_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
class ModelConfig:
    """Configuration for the model.

    Args:
        model: Name or path of the huggingface model to use.
        tokenizer: Name or path of the huggingface tokenizer to use.
        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
            available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        dtype: Data type for model weights and activations. The "auto" option
            will use FP16 precision for FP32 and FP16 models, and BF16 precision
            for BF16 models.
        seed: Random seed for reproducibility.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id. If unspecified, will use the default
            version.
        code_revision: The specific revision to use for the model code on
            Hugging Face Hub. It can be a branch name, a tag name, or a
            commit id. If unspecified, will use the default version.
        rope_scaling: Dictionary containing the scaling configuration for the
            RoPE embeddings. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id. If unspecified, will use
            the default version.
        max_model_len: Maximum length of a sequence (including prompt and
            output). If None, will be derived from the model.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
        load_in_4bit: Whether to load the FP16 model in bitsandbytes 4bit
            format. Works with AWQ models as well as FP16.
        load_in_8bit: Whether to load the FP16 model in 8bit format. Slower
            than load_in_smooth in terms of throughput.
        load_in_smooth: Whether to load the FP16 model in smoothquant format.
        deepspeed_fp_bits: Number of bits to use for DeepSpeed FP quantization.
            Supported number of bits are: 4, 6, 8, 12.
        quantization_param_path: Path to JSON file containing scaling factors.
            Used to load KV cache scaling factors into the model when KV cache
            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
            be used to load activation and weight scaling factors when the
            model dtype is FP8_E4M3 on ROCm.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
            Passing any non-None value raises ValueError.
        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode
        max_logprobs: Stored on the config as-is; this class does not use it
            further.
        disable_sliding_window: Whether to disable sliding window. If True,
            we will disable the sliding window functionality of the model.
            If the model does not support sliding window, this argument is
            ignored.
        skip_tokenizer_init: If true, skip initialization of tokenizer and
            detokenizer.
    """

    def __init__(
        self,
        model: str,
        tokenizer: str,
        tokenizer_mode: str,
        trust_remote_code: bool,
        dtype: Union[str, torch.dtype],
        seed: int,
        revision: Optional[str] = None,
        code_revision: Optional[str] = None,
        rope_scaling: Optional[dict] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        load_in_4bit: bool = False,
        load_in_8bit: bool = False,
        load_in_smooth: bool = False,
        deepspeed_fp_bits: Optional[int] = None,
        quantization_param_path: Optional[str] = None,
        enforce_eager: bool = True,
        max_context_len_to_capture: Optional[int] = None,
        max_seq_len_to_capture: Optional[int] = None,
        max_logprobs: int = 5,
        disable_sliding_window: bool = False,
        skip_tokenizer_init: bool = False,
    ) -> None:
        self.model = model
        self.tokenizer = tokenizer
        self.tokenizer_mode = tokenizer_mode
        self.trust_remote_code = trust_remote_code
        self.seed = seed
        self.revision = revision
        self.code_revision = code_revision
        self.rope_scaling = rope_scaling
        self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
        self.load_in_4bit = load_in_4bit
        self.load_in_8bit = load_in_8bit
        self.load_in_smooth = load_in_smooth
        self.deepspeed_fp_bits = deepspeed_fp_bits
        self.quantization_param_path = quantization_param_path
        self.enforce_eager = enforce_eager
        self.max_context_len_to_capture = max_context_len_to_capture
        # The deprecated argument is rejected outright, so the fallback in
        # the expression below can only ever contribute None.
        if self.max_context_len_to_capture is not None:
            raise ValueError("`max_context_len_to_capture` is deprecated. "
                             "Use `max_seq_len_to_capture` instead.")
        self.max_seq_len_to_capture = (max_seq_len_to_capture
                                       or max_context_len_to_capture)
        self.max_logprobs = max_logprobs
        self.disable_sliding_window = disable_sliding_window
        self.skip_tokenizer_init = skip_tokenizer_init

        self.hf_config = get_config(self.model, trust_remote_code, revision,
                                    code_revision, rope_scaling)
        self.hf_text_config = get_hf_text_config(self.hf_config)
        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
        self.max_model_len = _get_and_verify_max_len(
            hf_config=self.hf_text_config,
            max_model_len=max_model_len,
            disable_sliding_window=self.disable_sliding_window,
            sliding_window_len=self.get_hf_config_sliding_window())

        # Configs advertising exactly 131072 positions without any RoPE
        # scaling get an "extended" rope_scaling entry injected.
        if (getattr(self.hf_config, "max_position_embeddings", 0) == 131072
                and getattr(self.hf_config, "rope_scaling", None) is None):
            self.hf_config.update({"rope_scaling": {
                "type": "extended",
            }})

        if not self.skip_tokenizer_init:
            self._verify_tokenizer_mode()
        self._verify_embedding_mode()
        self._verify_quantization()
        self._verify_cuda_graph()

    def _verify_tokenizer_mode(self) -> None:
        """Lower-case `tokenizer_mode` and reject anything but auto/slow.

        Raises:
            ValueError: If the mode is neither "auto" nor "slow".
        """
        tokenizer_mode = self.tokenizer_mode.lower()
        if tokenizer_mode not in ["auto", "slow"]:
            raise ValueError(
                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
                "either 'auto' or 'slow'.")
        self.tokenizer_mode = tokenizer_mode

    def _verify_embedding_mode(self) -> None:
        """Set `embedding_mode` from the architectures listed in hf_config."""
        # The attribute may be present but set to None; `or []` keeps the
        # iteration below from raising TypeError in that case.
        architectures = getattr(self.hf_config, "architectures", None) or []
        self.embedding_mode = any(
            ModelRegistry.is_embedding_model(arch) for arch in architectures)

    def _verify_quantization(self) -> None:
        """Resolve and validate the quantization method.

        Reconciles the `quantization` argument with any
        `quantization_config` found on the HF config, applies the
        `load_in_4bit` / `load_in_8bit` / `load_in_smooth` / deepspeedfp
        options (which overwrite `hf_config.quantization_config`), and then
        checks the final method against the supported lists.

        Raises:
            ValueError: On a mismatch between the argument and the model
                config, an unsupported combination, or an unknown method.
        """
        supported_quantization = [*QUANTIZATION_METHODS]
        rocm_supported_quantization = ["gptq", "squeezellm"]
        if self.quantization is not None:
            self.quantization = self.quantization.lower()

        # Parse quantization method from the HF model config, if available.
        quant_cfg = getattr(self.hf_config, "quantization_config", None)
        if quant_cfg is not None:
            quant_method = quant_cfg.get("quant_method", "").lower()

            # Detect which checkpoint is it. The first method that reports
            # an override wins.
            for method in QUANTIZATION_METHODS.values():
                quantization_override = method.override_quantization_method(
                    quant_cfg, self.quantization)
                if quantization_override:
                    quant_method = quantization_override
                    self.quantization = quantization_override
                    break

            # Verify quantization configurations.
            if self.quantization is None:
                self.quantization = quant_method
            elif self.quantization != quant_method:
                raise ValueError(
                    "Quantization method specified in the model config "
                    f"({quant_method}) does not match the quantization "
                    f"method specified in the `quantization` argument "
                    f"({self.quantization}).")

        if self.load_in_4bit:
            # the kernels seem to not work with 4bit weight_only
            if torch.cuda.get_device_capability(0)[0] < 8:
                raise ValueError(
                    "load_in_4bit quantization is not supported on GPUs with "
                    "compute capability less than 8.0.")
            if self.quantization is None:
                self.quantization = "bnb"
                self.hf_config.quantization_config = {
                    "bits": 4,
                    "quant_mode": "weight_only",
                    "quant_method": "bnb",
                    "group_size": 128,
                    "zero_point": True,
                    "from_float": True
                }
            elif self.quantization == "awq":
                logger.warning("AWQ model is being loaded in 4bit bnb format.")
                self.quantization = "bnb"
                self.hf_config.quantization_config = {
                    "zero_point": True,
                    "q_group_size": 128,
                    "w_bit": 4,
                    "version": "gemm"
                }
            elif self.quantization != "bnb":
                raise ValueError("4bit quantization is not supported in "
                                 f"{self.quantization}.")

        if self.load_in_8bit:
            if self.quantization is None:
                self.quantization = "bnb"
            elif self.quantization != "bnb":
                raise ValueError("8bit quantization is not supported in "
                                 f"{self.quantization}.")
            self.hf_config.quantization_config = {
                "bits": 8,
                "quant_mode": "llm_int8",
                "quant_method": "bnb",
                "group_size": 128,
                "zero_point": True,
                "from_float": True
            }
            # This mode forces eager execution.
            self.enforce_eager = True

        if self.load_in_smooth:
            if self.quantization is None:
                self.quantization = "bnb"
            elif self.quantization != "bnb":
                raise ValueError("Smooth quantization is not supported in "
                                 f"{self.quantization}.")
            self.hf_config.quantization_config = {
                "bits": 8,
                "quant_mode": "smoothquant",
                "quant_method": "bnb",
                "group_size": 128,
                "zero_point": True,
                "from_float": True
            }
            # This mode forces eager execution.
            self.enforce_eager = True

        if self.quantization == "deepspeedfp":
            # Default group size is 32 for 4-bit and 128 otherwise; the
            # DEEPSPEED_GROUP_SIZE environment variable overrides it.
            gs = 32 if self.deepspeed_fp_bits == 4 else 128
            self.hf_config.quantization_config = {
                "bits": self.deepspeed_fp_bits,
                "group_size": int(os.environ.get("DEEPSPEED_GROUP_SIZE", gs)),
                "quant_method": "deepspeedfp"
            }

        if self.quantization is not None:
            if self.quantization not in supported_quantization:
                raise ValueError(
                    f"Unknown quantization method: {self.quantization}. Must "
                    f"be one of {supported_quantization}.")
            if is_hip(
            ) and self.quantization not in rocm_supported_quantization:
                raise ValueError(
                    f"{self.quantization} quantization is currently not "
                    "supported in ROCm.")
            if (self.quantization
                    not in ["marlin", "gptq_marlin_24", "gptq_marlin"]):
                logger.warning(
                    f"{self.quantization} quantization is not fully "
                    "optimized yet. The speed can be slower than "
                    "non-quantized models.")
            if self.quantization == "deepspeedfp" and self.deepspeed_fp_bits \
                    is None:
                raise ValueError(
                    "deepspeed_fp_bits must be specified when using "
                    "deepspeedfp quantization.")

    def _verify_cuda_graph(self) -> None:
        """Default `max_seq_len_to_capture` and cap it at `max_model_len`."""
        if self.max_seq_len_to_capture is None:
            self.max_seq_len_to_capture = self.max_model_len
        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                          self.max_model_len)

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        """Check that the model can be split as `parallel_config` requests.

        Raises:
            ValueError: If the attention head count is not divisible by the
                tensor parallel size, or the hidden layer count is not
                divisible by the pipeline parallel size.
        """
        total_num_attention_heads = self.hf_text_config.num_attention_heads
        tensor_parallel_size = parallel_config.tensor_parallel_size
        if total_num_attention_heads % tensor_parallel_size != 0:
            raise ValueError(
                f"Total number of attention heads ({total_num_attention_heads})"
                " must be divisible by tensor parallel size "
                f"({tensor_parallel_size}).")

        total_num_hidden_layers = self.hf_text_config.num_hidden_layers
        pipeline_parallel_size = parallel_config.pipeline_parallel_size
        if total_num_hidden_layers % pipeline_parallel_size != 0:
            raise ValueError(
                f"Total number of hidden layers ({total_num_hidden_layers}) "
                "must be divisible by pipeline parallel size "
                f"({pipeline_parallel_size}).")

    def get_hf_config_sliding_window(self) -> Optional[int]:
        """Get the sliding window size, or None if disabled.
        """
        # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in
        # addition to sliding window size. We check if that field is present
        # and if it's False, return None.
        if (hasattr(self.hf_text_config, "use_sliding_window")
                and not self.hf_text_config.use_sliding_window):
            return None
        return getattr(self.hf_text_config, "sliding_window", None)

    def get_sliding_window(self) -> Optional[int]:
        """Get the sliding window size, or None if disabled.
        """
        # If user disables sliding window, return None.
        if self.disable_sliding_window:
            return None
        # Otherwise get the value from the hf config.
        return self.get_hf_config_sliding_window()

    def get_vocab_size(self) -> int:
        """Return `vocab_size` from the HF text config."""
        return self.hf_text_config.vocab_size

    def get_hidden_size(self) -> int:
        """Return `hidden_size` from the HF text config."""
        return self.hf_text_config.hidden_size

    def get_head_size(self) -> int:
        """Return the per-head dimension.

        Uses `head_dim` when the config defines it, otherwise
        `hidden_size // num_attention_heads`.
        """
        if hasattr(self.hf_text_config, "head_dim"):
            return self.hf_text_config.head_dim
        # FIXME: This may not be true for all models.
        return (self.hf_text_config.hidden_size //
                self.hf_text_config.num_attention_heads)

    def get_total_num_kv_heads(self) -> int:
        """Returns the total number of KV heads."""
        # For GPTBigCode & Falcon:
        # NOTE: for falcon, when new_decoder_architecture is True, the
        # multi_query flag is ignored and we use n_head_kv for the number of
        # KV heads.
        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
        new_decoder_arch_falcon = (
            self.hf_config.model_type in falcon_model_types
            and getattr(self.hf_config, "new_decoder_architecture", False))
        if not new_decoder_arch_falcon and getattr(self.hf_text_config,
                                                   "multi_query", False):
            # Multi-query attention, only one KV head.
            # Currently, tensor parallelism is not supported in this case.
            return 1

        # For DBRX and MPT
        if self.hf_config.model_type in ["dbrx", "mpt"]:
            return getattr(self.hf_config.attn_config, "kv_n_heads",
                           self.hf_config.num_attention_heads)

        attributes = [
            # For Falcon:
            "n_head_kv",
            "num_kv_heads",
            # For LLaMA-2:
            "num_key_value_heads",
            # For ChatGLM:
            "multi_query_group_num",
        ]
        # The first attribute that is present and not None wins.
        for attr in attributes:
            num_kv_heads = getattr(self.hf_text_config, attr, None)
            if num_kv_heads is not None:
                return num_kv_heads

        # For non-grouped-query attention models, the number of KV heads is
        # equal to the number of attention heads.
        return self.hf_text_config.num_attention_heads

    def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
        """Returns the number of KV heads per GPU."""
        total_num_kv_heads = self.get_total_num_kv_heads()
        # If tensor parallelism is used, we divide the number of KV heads by
        # the tensor parallel size. We will replicate the KV heads in the
        # case where the number of KV heads is smaller than the tensor
        # parallel size so each GPU has at least one KV head.
        return max(1,
                   total_num_kv_heads // parallel_config.tensor_parallel_size)

    def get_num_attention_heads(self,
                                parallel_config: "ParallelConfig") -> int:
        """Return attention heads per tensor-parallel rank (floor division)."""
        return self.hf_text_config.num_attention_heads // \
            parallel_config.tensor_parallel_size

    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
        """Return hidden layers per pipeline-parallel rank (floor division)."""
        total_num_hidden_layers = self.hf_text_config.num_hidden_layers
        return total_num_hidden_layers // parallel_config.pipeline_parallel_size
class CacheConfig:
    """Settings that govern the KV cache.

    Args:
        block_size: Number of tokens held by one cache block.
        gpu_memory_utilization: Fraction of GPU memory Aphrodite may use.
            Values above 1.0 are rejected; exactly 1.0 is accepted.
        swap_space: CPU swap space per GPU, in GiB (stored in bytes as
            `swap_space_bytes`).
        cache_dtype: Storage data type of the KV cache: "auto", "fp8",
            "fp8_e4m3" or "fp8_e5m2".
        num_gpu_blocks_override: If given, replaces the profiled number of
            GPU blocks. Does nothing if None.
        sliding_window: Sliding window size, or None. Only consulted here to
            reject prefix caching when a window is set.
        enable_prefix_caching: Whether prefix caching is requested.
    """

    def __init__(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        swap_space: int,
        cache_dtype: str,
        num_gpu_blocks_override: Optional[int] = None,
        sliding_window: Optional[int] = None,
        enable_prefix_caching: bool = False,
    ) -> None:
        # Assignment order is deliberate: metrics_info() exposes __dict__ in
        # insertion order.
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.swap_space_bytes = swap_space * _GB
        self.num_gpu_blocks_override = num_gpu_blocks_override
        self.cache_dtype = cache_dtype
        self.sliding_window = sliding_window
        self.enable_prefix_caching = enable_prefix_caching

        for validate in (self._verify_args, self._verify_cache_dtype,
                         self._verify_prefix_caching):
            validate()

        # Will be set after profiling.
        self.num_gpu_blocks = None
        self.num_cpu_blocks = None

    def metrics_info(self):
        """Return every attribute as a str -> str mapping.

        The values are stringified so the result can be used as prometheus
        metrics info.
        """
        return {name: str(val) for name, val in vars(self).items()}

    def _verify_args(self) -> None:
        """Raise ValueError when gpu_memory_utilization exceeds 1.0."""
        if self.gpu_memory_utilization > 1.0:
            raise ValueError(
                "GPU memory utilization must be less than 1.0. Got "
                f"{self.gpu_memory_utilization}.")

    def _verify_cache_dtype(self) -> None:
        """Accept "auto" or an fp8 variant; raise ValueError otherwise."""
        if self.cache_dtype == "auto":
            return
        if self.cache_dtype not in ("fp8", "fp8_e4m3", "fp8_e5m2"):
            raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
        logger.info(
            "Using fp8 data type to store kv cache. It reduces the GPU "
            "memory footprint and boosts the performance. "
            "Meanwhile, it may cause accuracy drop without a proper "
            "scaling factor")

    def _verify_prefix_caching(self) -> None:
        """Reject prefix caching combined with a sliding window or "fp8".

        Raises:
            NotImplementedError: If prefix caching is enabled and either a
                sliding window is set or cache_dtype is exactly "fp8".
        """
        if self.enable_prefix_caching:
            if self.sliding_window is not None:
                raise NotImplementedError(
                    "Prefix caching is not supported with sliding window. "
                    "Run with --disable-sliding-window to use prefix caching.")
            if self.cache_dtype == "fp8":
                raise NotImplementedError(
                    "Prefix caching is not supported for fp8 cache_dtype. "
                    "Run with --kv-cache-dtype auto to use prefix caching.")

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        """Check the requested swap space against total CPU memory.

        Raises ValueError above 70% of CPU memory and logs a warning above
        40%.
        """
        host_bytes = get_cpu_memory()
        # FIXME: Here, it is assumed that the GPUs in a tensor parallel
        # group are in the same node. However, the GPUs may span multiple nodes.
        gpus_on_node = parallel_config.tensor_parallel_size
        swap_bytes = self.swap_space_bytes * gpus_on_node

        summary = (f"{swap_bytes / _GB:.2f} GiB out of "
                   f"the {host_bytes / _GB:.2f} GiB total CPU memory is "
                   "allocated for the swap space.")
        if swap_bytes > 0.7 * host_bytes:
            raise ValueError("Too large swap space. " + summary)
        if swap_bytes > 0.4 * host_bytes:
            logger.warning("Possibly too large swap space. " + summary)
@dataclass
class TokenizerPoolConfig:
    """Settings for the pool of tokenizer workers.

    Args:
        pool_size: How many tokenizer workers the pool holds.
        pool_type: Kind of pool; only "ray" is accepted.
        extra_config: Extra pool options. How they are interpreted depends
            on the pool type.
    """
    pool_size: int
    pool_type: str
    extra_config: dict

    def __post_init__(self):
        """Validate the pool type and the type of `extra_config`."""
        supported_pool_types = ("ray", )
        if self.pool_type not in supported_pool_types:
            raise ValueError(f"Unknown pool type: {self.pool_type}")
        if not isinstance(self.extra_config, dict):
            raise ValueError("extra_config must be a dictionary.")

    @classmethod
    def create_config(
        cls, tokenizer_pool_size: int, tokenizer_pool_type: str,
        tokenizer_pool_extra_config: Optional[Union[str, dict]]
    ) -> Optional["TokenizerPoolConfig"]:
        """Build a TokenizerPoolConfig, or return None for an empty pool.

        Args:
            tokenizer_pool_size: Number of tokenizer workers in the pool.
                A falsy value (e.g. 0) yields None.
            tokenizer_pool_type: Type of the pool.
            tokenizer_pool_extra_config: Additional config for the pool.
                A string is parsed as JSON; any other falsy value becomes
                an empty dict.
        """
        if not tokenizer_pool_size:
            return None

        extra = tokenizer_pool_extra_config
        if isinstance(extra, str):
            extra = json.loads(extra)
        else:
            extra = extra or {}
        return cls(tokenizer_pool_size, tokenizer_pool_type, extra)
class LoadFormat(str, enum.Enum):
    """Weight-loading formats; each member is also a plain `str`."""
    # Try safetensors first, fall back to the pytorch bin format.
    AUTO = "auto"
    # Pytorch bin format.
    PT = "pt"
    # Safetensors format.
    SAFETENSORS = "safetensors"
    # Pytorch format plus a numpy cache to speed up loading.
    NPCACHE = "npcache"
    # Randomly initialised weights, mainly for profiling.
    DUMMY = "dummy"
    # CoreWeave's tensorizer library.
    TENSORIZER = "tensorizer"
    # NOTE(review): not described anywhere in this file; presumably loads
    # pre-sharded per-rank state — confirm against the loader.
    SHARDED_STATE = "sharded_state"
@dataclass
class LoadConfig:
    """Configuration for loading the model weights.

    Args:
        download_dir: Directory to download and load the weights, default to
            the default cache directory of huggingface.
        load_format: The format of the model weights to load:
            "auto" will try to load the weights in the safetensors format and
                fall back to the pytorch bin format if safetensors format is
                not available.
            "pt" will load the weights in the pytorch bin format.
            "safetensors" will load the weights in the safetensors format.
            "npcache" will load the weights in pytorch format and store
                a numpy cache to speed up the loading.
            "dummy" will initialize the weights with random values, which is
                mainly for profiling.
            "tensorizer" will use CoreWeave's tensorizer library for
                fast weight loading.
        model_loader_extra_config: Extra options for the model loader. A
            string is parsed as JSON; None or any other falsy value is
            normalised to an empty dict.
    """
    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
    download_dir: Optional[str] = None
    model_loader_extra_config: Optional[Union[str, dict]] = field(
        default_factory=dict)

    def __post_init__(self):
        """Normalise `model_loader_extra_config` and validate the format."""
        model_loader_extra_config = self.model_loader_extra_config or {}
        if isinstance(model_loader_extra_config, str):
            model_loader_extra_config = json.loads(model_loader_extra_config)
        # Always store the normalised value, so that None / "" really do
        # become {} instead of the fallback being silently discarded.
        self.model_loader_extra_config = model_loader_extra_config
        self._verify_load_format()

    def _verify_load_format(self) -> None:
        """Convert a string `load_format` into a LoadFormat member.

        Non-string values are left untouched.

        Raises:
            ValueError: If the string is not a valid LoadFormat value, or
                the format is listed as unsupported on ROCm while running
                on ROCm.
        """
        if not isinstance(self.load_format, str):
            return

        load_format = self.load_format.lower()
        self.load_format = LoadFormat(load_format)

        # Lower-case format values that cannot be used on ROCm (none today).
        rocm_not_supported_load_format: List[str] = []
        if is_hip() and load_format in rocm_not_supported_load_format:
            # Compare member *values*: the unsupported list holds the
            # lower-case strings, not the upper-case member names.
            rocm_supported_load_format = [
                f.value for f in LoadFormat
                if (f.value not in rocm_not_supported_load_format)
            ]
            raise ValueError(
                f"load format '{load_format}' is not supported in ROCm. "
                f"Supported load formats are "
                f"{rocm_supported_load_format}")
class ParallelConfig:
    """Settings for distributed execution.

    Args:
        pipeline_parallel_size: Number of pipeline parallel groups.
        tensor_parallel_size: Number of tensor parallel groups.
        worker_use_ray: Deprecated, use distributed_executor_backend instead.
        max_parallel_loading_workers: Maximum number of multiple batches
            when load model sequentially. To avoid RAM OOM when using tensor
            parallel and large models.
        disable_custom_all_reduce: Disable the custom all-reduce kernel and
            fall back to NCCL.
        tokenizer_pool_config: Config for the tokenizer pool.
            If None, will use synchronous tokenization.
        ray_workers_use_nsight: Whether to profile Ray workers with nsight, see
            https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
        placement_group: ray distributed model workers placement group.
        distributed_executor_backend: Backend to use for distributed model
            workers, either "ray" or "mp" (multiprocessing). If either
            pipeline_parallel_size or tensor_parallel_size is greater than 1,
            will default to "ray" if Ray is installed or "mp" otherwise.
    """

    def __init__(
        self,
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        worker_use_ray: Optional[bool] = None,
        max_parallel_loading_workers: Optional[int] = None,
        disable_custom_all_reduce: bool = False,
        tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
        ray_workers_use_nsight: bool = False,
        placement_group: Optional["PlacementGroup"] = None,
        distributed_executor_backend: Optional[str] = None,
    ) -> None:
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.distributed_executor_backend = distributed_executor_backend
        self.max_parallel_loading_workers = max_parallel_loading_workers
        self.disable_custom_all_reduce = disable_custom_all_reduce
        self.tokenizer_pool_config = tokenizer_pool_config
        self.ray_workers_use_nsight = ray_workers_use_nsight
        self.placement_group = placement_group

        self.world_size = pipeline_parallel_size * self.tensor_parallel_size

        # Legacy flag: it selects the "ray" backend and conflicts with any
        # other explicitly chosen backend.
        if worker_use_ray:
            if self.distributed_executor_backend is None:
                self.distributed_executor_backend = "ray"
            elif self.distributed_executor_backend != "ray":
                raise ValueError(f"worker-use-ray can't be used with "
                                 f"distributed executor backend "
                                 f"'{self.distributed_executor_backend}'.")

        # No backend chosen but more than one worker: use Ray when it is
        # importable, multiprocessing otherwise.
        if self.distributed_executor_backend is None and self.world_size > 1:
            from aphrodite.executor import ray_utils
            if ray_utils.ray is not None:
                self.distributed_executor_backend = "ray"
            else:
                self.distributed_executor_backend = "mp"

        self._verify_args()

    def _verify_args(self) -> None:
        """Validate the combination of parallelism options.

        May set `disable_custom_all_reduce` to True as a side effect.

        Raises:
            NotImplementedError: If pipeline_parallel_size is above 1.
            ValueError: For an unknown backend, or nsight profiling without
                the Ray backend.
        """
        if self.pipeline_parallel_size > 1:
            raise NotImplementedError(
                "Pipeline parallelism is not supported yet.")

        if self.distributed_executor_backend not in ("ray", "mp", None):
            raise ValueError(
                "Unrecognized distributed executor backend. Supported values "
                "are 'ray' or 'mp'.")

        if not self.disable_custom_all_reduce and self.world_size > 1:
            if is_hip():
                self.disable_custom_all_reduce = True
                logger.info(
                    "Disabled the custom all-reduce kernel because it is not "
                    "supported on AMD GPUs.")
            elif self.pipeline_parallel_size > 1:
                self.disable_custom_all_reduce = True
                logger.info(
                    "Disabled the custom all-reduce kernel because it is not "
                    "supported with pipeline parallelism.")

        if (self.ray_workers_use_nsight
                and self.distributed_executor_backend != "ray"):
            raise ValueError("Unable to use nsight profiling unless workers "
                             "run with Ray.")
class SchedulerConfig:
    """Settings for the request scheduler.

    Args:
        max_num_batched_tokens: Upper bound on tokens processed in one
            iteration. When None, a default is chosen: 768 with chunked
            prefill, otherwise max(max_model_len, 32768) in embedding mode
            or max(max_model_len, 2048).
        max_num_seqs: Upper bound on sequences processed in one iteration.
        max_model_len: Maximum length of a sequence (including prompt
            and generated text).
        use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not.
        num_lookahead_slots: The number of slots to allocate per sequence per
            step, beyond the known token ids. This is used in speculative
            decoding to store KV activations of tokens which may or may not be
            accepted.
        delay_factor: Apply a delay (of delay factor multiplied by previous
            prompt latency) before scheduling next prompt.
        enable_chunked_prefill: If True, prefill requests can be chunked based
            on the remaining max_num_batched_tokens.
        embedding_mode: Whether the running model is for embedding.
    """

    def __init__(
        self,
        max_num_batched_tokens: Optional[int],
        max_num_seqs: int,
        max_model_len: int,
        use_v2_block_manager: bool = False,
        num_lookahead_slots: int = 0,
        delay_factor: float = 0.0,
        enable_chunked_prefill: bool = False,
        embedding_mode: Optional[bool] = None,
    ) -> None:
        token_budget = max_num_batched_tokens
        if token_budget is None:
            if enable_chunked_prefill:
                # For chunked prefill, choose the well-tuned batch size.
                token_budget = 768
            elif embedding_mode:
                # For embedding, choose specific value for higher throughput
                token_budget = max(max_model_len,
                                   _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS)
            else:
                # If max_model_len is too short, use 2048 as the default value
                # for higher throughput.
                token_budget = max(max_model_len, 2048)
        self.max_num_batched_tokens = token_budget

        if enable_chunked_prefill:
            logger.info("Chunked prefill is enabled (EXPERIMENTAL).")

        self.max_num_seqs = max_num_seqs
        self.max_model_len = max_model_len
        self.use_v2_block_manager = use_v2_block_manager
        self.num_lookahead_slots = num_lookahead_slots
        self.delay_factor = delay_factor
        self.chunked_prefill_enabled = enable_chunked_prefill
        self.embedding_mode = embedding_mode

        self._verify_args()

    def _verify_args(self) -> None:
        """Validate the token budget, sequence limit and lookahead slots.

        Raises:
            ValueError: If the token budget is below max_model_len without
                chunked prefill, below max_num_seqs, or if
                num_lookahead_slots is negative.
        """
        if (self.max_num_batched_tokens < self.max_model_len
                and not self.chunked_prefill_enabled):
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({self.max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes Aphrodite reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len.")

        if self.max_num_batched_tokens < self.max_num_seqs:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                "be greater than or equal to max_num_seqs "
                f"({self.max_num_seqs}).")

        if self.num_lookahead_slots < 0:
            raise ValueError(
                "num_lookahead_slots "
                f"({self.num_lookahead_slots}) must be greater than or "
                "equal to 0.")
class DeviceConfig:
    """Resolves the device type name and the ``torch.device`` to use.

    Args:
        device: Either an explicit device type string, or "auto" to pick one
            via ``is_neuron()`` / ``is_cpu()``, falling back to "cuda".

    Attributes set:
        device_type: The resolved device type string.
        device: The ``torch.device`` built from it ("cpu" for neuron).
    """

    def __init__(self, device: str = "auto") -> None:
        if device != "auto":
            # Device type is assigned explicitly.
            resolved_type = device
        elif is_neuron():
            resolved_type = "neuron"
        elif is_cpu():
            resolved_type = "cpu"
        else:
            # torch.cuda.is_available() is deliberately not called here, to
            # avoid initializing CUDA before workers are forked.
            resolved_type = "cuda"
        self.device_type = resolved_type

        # Some device types require processing inputs on CPU.
        processes_inputs_on_cpu = self.device_type in ["neuron"]
        torch_device_name = "cpu" if processes_inputs_on_cpu \
            else self.device_type
        self.device = torch.device(torch_device_name)
class SpeculativeConfig:
    """Configuration for speculative decoding.

    The configuration is currently specialized to draft-model speculative
    decoding with top-1 proposals.
    """

    @staticmethod
    def maybe_create_spec_config(
        target_model_config: ModelConfig,
        target_parallel_config: ParallelConfig,
        target_dtype: str,
        speculative_model: Optional[str],
        num_speculative_tokens: Optional[int],
        speculative_max_model_len: Optional[int],
        enable_chunked_prefill: bool,
        use_v2_block_manager: bool,
        speculative_disable_by_batch_size: Optional[int],
        ngram_prompt_lookup_max: Optional[int],
        ngram_prompt_lookup_min: Optional[int],
    ) -> Optional["SpeculativeConfig"]:
        """Create a SpeculativeConfig if possible, else return None.

        This function attempts to create a SpeculativeConfig object based on
        the provided parameters. If neither a speculative model nor a number
        of speculative tokens is given, speculative decoding is considered
        disabled and None is returned.

        Args:
            target_model_config (ModelConfig): The configuration of the target
                model.
            target_parallel_config (ParallelConfig): The parallel configuration
                for the target model.
            target_dtype (str): The data type used for the target model.
                Not read by this function.
            speculative_model (Optional[str]): The name of the speculative
                model, if provided. The special value "[ngram]" selects
                ngram prompt lookup instead of a draft model.
            num_speculative_tokens (Optional[int]): The number of speculative
                tokens, if provided.
            speculative_max_model_len (Optional[int]): The maximum model len of
                the speculative model. Used when testing the ability to skip
                speculation for some sequences.
            enable_chunked_prefill (bool): Whether Aphrodite is configured to
                use chunked prefill or not. Used for raising an error since its
                not yet compatible with spec decode.
            use_v2_block_manager (bool): Whether Aphrodite is configured to
                use the v2 block manager or not. Used for raising an error
                since the v2 block manager is required with spec decode.
            speculative_disable_by_batch_size (Optional[int]): Disable
                speculative decoding for new incoming requests when the number
                of enqueue requests is larger than this value, if provided.
            ngram_prompt_lookup_max (Optional[int]): Max size of ngram token
                window, if provided.
            ngram_prompt_lookup_min (Optional[int]): Min size of ngram token
                window, if provided.

        Returns:
            Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
                the necessary conditions are met, else None.

        Raises:
            ValueError: If only one of speculative_model and
                num_speculative_tokens is given, if the batch-size threshold
                is below 2, if chunked prefill is enabled, if the v2 block
                manager is not enabled, or if the ngram window bounds are
                invalid.
        """
        if speculative_model is None and num_speculative_tokens is None:
            return None

        # Exactly one of the two was supplied. This is user input, so it is
        # reported with ValueError rather than an assert (asserts vanish
        # under ``python -O`` and carry no message).
        if speculative_model is None or num_speculative_tokens is None:
            raise ValueError(
                "Expected both speculative_model and "
                "num_speculative_tokens to be provided, but found "
                f"{speculative_model=} and {num_speculative_tokens=}.")

        if (speculative_disable_by_batch_size is not None
                and speculative_disable_by_batch_size < 2):
            raise ValueError("Expected the batch size threshold of disabling "
                             "speculative decoding is > 1, but got "
                             f"{speculative_disable_by_batch_size=}")

        if enable_chunked_prefill:
            raise ValueError(
                "Speculative decoding and chunked prefill are "
                f"currently mutually exclusive ({enable_chunked_prefill=}).")

        if not use_v2_block_manager:
            raise ValueError(
                "Speculative decoding requires usage of the V2 "
                "block manager. Enable it with --use-v2-block-manager.")

        # TODO: The user should be able to specify revision/quantization/max
        # model len for the draft model. It is not currently supported.
        draft_revision = None
        draft_code_revision = None
        draft_quantization = None

        if speculative_model == "[ngram]":
            if ngram_prompt_lookup_min is None:
                ngram_prompt_lookup_min = 1
            if ngram_prompt_lookup_max is None or ngram_prompt_lookup_max < 1:
                raise ValueError(f"{ngram_prompt_lookup_max=} must be > 0")
            if ngram_prompt_lookup_min < 1:
                raise ValueError(f"{ngram_prompt_lookup_min=} must be > 0")
            if ngram_prompt_lookup_min > ngram_prompt_lookup_max:
                raise ValueError(f"{ngram_prompt_lookup_min=} cannot be "
                                 f"larger than {ngram_prompt_lookup_max=}")

            # TODO: current we still need extract vocab_size from target model
            # config, in future, we may try refactoring it out, and set
            # draft related config as None here.
            draft_model_config = target_model_config
            draft_parallel_config = target_parallel_config
        else:
            # A real draft model is used; ngram lookup is switched off.
            ngram_prompt_lookup_max = 0
            ngram_prompt_lookup_min = 0
            draft_model_config = ModelConfig(
                model=speculative_model,
                tokenizer=target_model_config.tokenizer,
                tokenizer_mode=target_model_config.tokenizer_mode,
                trust_remote_code=target_model_config.trust_remote_code,
                dtype=target_model_config.dtype,
                seed=target_model_config.seed,
                revision=draft_revision,
                code_revision=draft_code_revision,
                tokenizer_revision=target_model_config.tokenizer_revision,
                max_model_len=None,
                quantization=draft_quantization,
                enforce_eager=target_model_config.enforce_eager,
                max_seq_len_to_capture=target_model_config.
                max_seq_len_to_capture,
                max_logprobs=target_model_config.max_logprobs,
            )

            draft_model_config.max_model_len = (
                SpeculativeConfig._maybe_override_draft_max_model_len(
                    speculative_max_model_len,
                    draft_model_config.max_model_len,
                    target_model_config.max_model_len,
                ))

            draft_parallel_config = (
                SpeculativeConfig.create_draft_parallel_config(
                    target_parallel_config))

        return SpeculativeConfig(draft_model_config, draft_parallel_config,
                                 num_speculative_tokens,
                                 speculative_disable_by_batch_size,
                                 ngram_prompt_lookup_max,
                                 ngram_prompt_lookup_min)

    @staticmethod
    def _maybe_override_draft_max_model_len(
        speculative_max_model_len: Optional[int],
        draft_max_model_len: int,
        target_max_model_len: int,
    ) -> int:
        """Determine the max sequence len for the draft model. This is usually
        the draft_max_model_len, but may be the target_max_model_len if it is
        less than the draft_max_model_len, or may be speculative_max_model_len
        if it is specified.

        This is necessary so that sequences do not exceed the capacity of the
        draft model or the target model.

        speculative_max_model_len is mainly used for testing that sequences can
        skip speculation.

        Raises:
            ValueError: If speculative_max_model_len exceeds either the draft
                or the target maximum length.
        """
        if speculative_max_model_len is not None:
            if speculative_max_model_len > draft_max_model_len:
                raise ValueError(f"{speculative_max_model_len=} cannot be "
                                 f"larger than {draft_max_model_len=}")

            if speculative_max_model_len > target_max_model_len:
                raise ValueError(f"{speculative_max_model_len=} cannot be "
                                 f"larger than {target_max_model_len=}")

            return speculative_max_model_len

        return min(
            draft_max_model_len,
            target_max_model_len,
        )

    @staticmethod
    def create_draft_parallel_config(
            target_parallel_config: ParallelConfig) -> ParallelConfig:
        """Create a parallel config for use by the draft worker.

        This is mostly a copy of the target parallel config. In the future the
        draft worker can have a different parallel strategy, e.g. TP=1.
        """
        draft_parallel_config = ParallelConfig(
            pipeline_parallel_size=target_parallel_config.
            pipeline_parallel_size,
            tensor_parallel_size=target_parallel_config.tensor_parallel_size,
            distributed_executor_backend=target_parallel_config.
            distributed_executor_backend,
            max_parallel_loading_workers=target_parallel_config.
            max_parallel_loading_workers,
            disable_custom_all_reduce=target_parallel_config.
            disable_custom_all_reduce,
            tokenizer_pool_config=target_parallel_config.tokenizer_pool_config,
            ray_workers_use_nsight=target_parallel_config.
            ray_workers_use_nsight,
            placement_group=target_parallel_config.placement_group,
        )

        return draft_parallel_config

    def __init__(
        self,
        draft_model_config: ModelConfig,
        draft_parallel_config: ParallelConfig,
        num_speculative_tokens: int,
        speculative_disable_by_batch_size: Optional[int],
        ngram_prompt_lookup_max: Optional[int],
        ngram_prompt_lookup_min: Optional[int],
    ):
        """Create a SpeculativeConfig object.

        Args:
            draft_model_config: ModelConfig for the draft model.
            draft_parallel_config: ParallelConfig for the draft model.
            num_speculative_tokens: The number of tokens to sample from the
                draft model before scoring with the target model.
            speculative_disable_by_batch_size: Disable speculative
                decoding for new incoming requests when the number of
                enqueue requests is larger than this value.
            ngram_prompt_lookup_max: Max size of ngram token window.
                None is stored as 0.
            ngram_prompt_lookup_min: Min size of ngram token window.
                None is stored as 0.

        Raises:
            ValueError: If num_speculative_tokens is not positive.
        """
        self.draft_model_config = draft_model_config
        self.draft_parallel_config = draft_parallel_config
        self.num_speculative_tokens = num_speculative_tokens
        self.speculative_disable_by_batch_size = \
            speculative_disable_by_batch_size
        self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0
        self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0

        self._verify_args()

    def _verify_args(self) -> None:
        """Check the token count and the draft model/parallel config pair."""
        if self.num_speculative_tokens <= 0:
            raise ValueError("Expected num_speculative_tokens to be greater "
                             f"than zero ({self.num_speculative_tokens}).")

        if self.draft_model_config:
            self.draft_model_config.verify_with_parallel_config(
                self.draft_parallel_config)

    @property
    def num_lookahead_slots(self) -> int:
        """The number of additional slots the scheduler should allocate per
        step, in addition to the slots allocated for each known token.

        This is equal to the number of speculative tokens, as each speculative
        token must be scored.
        """
        return self.num_speculative_tokens

    def __repr__(self) -> str:
        # A positive ngram window means no separate draft model is in use.
        if self.ngram_prompt_lookup_max > 0:
            draft_model = "[ngram]"
        else:
            draft_model = self.draft_model_config.model
        num_spec_tokens = self.num_speculative_tokens
        return f"SpeculativeConfig({draft_model=}, {num_spec_tokens=})"
@dataclass
class LoRAConfig:
    """LoRA-related limits and dtype settings, validated on construction.

    ``__post_init__`` checks the rank, extra vocab size and adapter counts;
    the ``verify_with_*`` methods cross-check against other configs.
    """

    max_lora_rank: int
    max_loras: int
    fully_sharded_loras: bool = False
    # Defaults to max_loras when left as None (see __post_init__).
    max_cpu_loras: Optional[int] = None
    # Resolved to a torch dtype by verify_with_model_config.
    lora_dtype: Optional[torch.dtype] = None
    lora_extra_vocab_size: int = 256
    # This is a constant.
    lora_vocab_padding_size: ClassVar[int] = 256
    long_lora_scaling_factors: Optional[Tuple[float]] = None

    def __post_init__(self):
        """Validate field values and fill in the max_cpu_loras default.

        Raises:
            ValueError: If any field is outside its allowed set or range.
        """
        # Keep this in sync with kernels/punica/bgmv/bgmv_config.h
        allowed_ranks = (8, 16, 32, 64)
        allowed_extra_vocab_sizes = (0, 256, 512)

        if self.max_lora_rank not in allowed_ranks:
            raise ValueError(
                f"max_lora_rank ({self.max_lora_rank}) must be one of "
                f"{allowed_ranks}.")

        if self.lora_extra_vocab_size not in allowed_extra_vocab_sizes:
            raise ValueError(
                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
                f"must be one of {allowed_extra_vocab_sizes}.")

        if self.max_loras < 1:
            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")

        if self.max_cpu_loras is None:
            self.max_cpu_loras = self.max_loras
            return

        if self.max_cpu_loras < self.max_loras:
            raise ValueError(
                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                f"max_loras ({self.max_loras})")

    def verify_with_model_config(self, model_config: ModelConfig):
        """Resolve lora_dtype and warn about untested quantization methods."""
        if self.lora_dtype in (None, "auto"):
            # Unset or "auto": follow the model's dtype.
            self.lora_dtype = model_config.dtype
        elif isinstance(self.lora_dtype, str):
            # A dtype name: look it up as an attribute of torch.
            self.lora_dtype = getattr(torch, self.lora_dtype)

        tested_quantizations = ["awq", "gptq"]
        if (model_config.quantization
                and model_config.quantization not in tested_quantizations):
            # TODO support all other quants
            logger.warning(f"{model_config.quantization} quantization is not "
                           "tested with LoRA yet.")

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
        """Enforce the batched-token ceiling required when LoRA is enabled.

        Raises:
            ValueError: If max_num_batched_tokens exceeds 65528.
        """
        exceeds_kernel_limit = scheduler_config.max_num_batched_tokens > 65528
        if exceeds_kernel_limit:
            raise ValueError(
                "Due to limitations of the custom LoRA CUDA kernel, "
                "max_num_batched_tokens must be <= 65528 when "
                "LoRA is enabled.")
@dataclass
class VisionLanguageConfig:
    """Describes the image input format and related sizes used when running
    vision language models."""

    class ImageInputType(enum.Enum):
        """Image input type into the vision language model.

        An image roughly goes through the following transformation:
        Raw image --> pixel values --> image features --> image embeddings.

        The difference between different image input types is where the
        image encoder (pixel values --> image features) is run.
        Different image input types also correspond to different tensor shapes.

        For example, for Llava, PIXEL_VALUES: (1, 3, 336, 336).
        IMAGE_FEATURES: (1, 576, 1024).
        """
        PIXEL_VALUES = enum.auto()
        IMAGE_FEATURES = enum.auto()

    image_input_type: ImageInputType
    # The input id corresponding to image token.
    image_token_id: int
    # Used for running `run_prefill_max_token`.
    # For models that support varying resolution, this corresponds to
    # worst case scenario (biggest supported resolution).
    image_input_shape: tuple
    image_feature_size: int

    @classmethod
    def get_image_input_enum_type(
            cls, value: str) -> "VisionLanguageConfig.ImageInputType":
        """Look up an ImageInputType member by name, ignoring case.

        Raises:
            ValueError: If the upper-cased value names no member; the
                message lists the valid member names.
        """
        member_name = value.upper()
        try:
            return cls.ImageInputType[member_name]
        except KeyError as e:
            valid_names = [member.name for member in cls.ImageInputType]
            raise ValueError(f"{value} is not a valid choice. "
                             f"Expecting to choose from "
                             f"{valid_names}.") from e
# Maps each dtype name accepted as a string to the torch dtype it denotes.
# "half"/"float16" and "float"/"float32" are two spellings of the same dtype.
_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.float16,
    "float16": torch.float16,
    "float": torch.float32,
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}

# dtype names left out when listing the dtypes supported on ROCm.
_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]
def _get_and_verify_dtype(
    config: PretrainedConfig,
    dtype: Union[str, torch.dtype],
) -> torch.dtype:
    """Resolve the requested dtype against the model config's dtype.

    Args:
        config: Model config; its ``torch_dtype`` attribute is read, with
            float32 assumed when it is missing or None.
        dtype: A torch dtype, a dtype name (case-insensitive), or "auto".
            "auto" keeps the config dtype, except that float32 becomes
            float16.

    Returns:
        The torch dtype to use.

    Raises:
        ValueError: If dtype is an unknown name or of an unsupported type,
            or if float32 is selected while ``is_hip()`` is true.
    """
    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
    # because config.torch_dtype can be None.
    config_dtype = getattr(config, "torch_dtype", None)
    if config_dtype is None:
        config_dtype = torch.float32

    if isinstance(dtype, torch.dtype):
        torch_dtype = dtype
    elif isinstance(dtype, str):
        dtype = dtype.lower()
        if dtype == "auto":
            # Following the common practice, we use float16 for float32
            # models.
            torch_dtype = (torch.float16
                           if config_dtype == torch.float32 else config_dtype)
        elif dtype in _STR_DTYPE_TO_TORCH_DTYPE:
            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
        else:
            raise ValueError(f"Unknown dtype: {dtype}")
    else:
        raise ValueError(f"Unknown dtype: {dtype}")

    if is_hip() and torch_dtype == torch.float32:
        rocm_supported_dtypes = [
            name for name in _STR_DTYPE_TO_TORCH_DTYPE
            if name not in _ROCM_NOT_SUPPORTED_DTYPE
        ]
        raise ValueError(f"dtype '{dtype}' is not supported in ROCm. "
                         f"Supported dtypes are {rocm_supported_dtypes}")

    # Verify the dtype. Upcasting to float32 and downcasting from float32
    # are allowed silently; only a float16 <-> bfloat16 cast is warned about.
    involves_float32 = (torch_dtype == torch.float32
                        or config_dtype == torch.float32)
    if torch_dtype != config_dtype and not involves_float32:
        logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

    return torch_dtype
def _get_and_verify_max_len(
    hf_config: PretrainedConfig,
    max_model_len: Optional[int],
    disable_sliding_window: bool,
    sliding_window_len: Optional[int],
) -> int:
    """Get and verify the model's maximum length.

    The model's own maximum is derived as the smallest value found among a
    set of known length attributes on ``hf_config``, optionally capped by the
    sliding window length and scaled by the config's RoPE scaling factor.

    Args:
        hf_config: Model config whose length attributes and ``rope_scaling``
            are read. Its ``rope_scaling`` attribute is overwritten when the
            requested length exceeds the derived one.
        max_model_len: Requested maximum length, or None to use the derived
            value.
        disable_sliding_window: Whether sliding window is manually disabled.
        sliding_window_len: Sliding window length from the model config, if
            any.

    Returns:
        The maximum model length to use.

    Raises:
        NotImplementedError: If sliding window is disabled for a model whose
            rope_scaling type is not one of "su", "longrope" or "llama3".
    """
    derived_max_model_len = float("inf")
    possible_keys = [
        # Cohere, Command-R
        "model_max_length",
        # OPT
        "max_position_embeddings",
        # GPT-2
        "n_positions",
        # MPT
        "max_seq_len",
        # ChatGLM2
        "seq_length",
        # Others
        "max_sequence_length",
        "max_seq_length",
        "seq_len",
    ]
    # Choose the smallest "max_length" from the possible keys.
    for key in possible_keys:
        max_len = getattr(hf_config, key, None)
        if max_len is not None:
            derived_max_model_len = min(derived_max_model_len, max_len)

    # If sliding window is manually disabled, max_length should be less
    # than the sliding window length in the model config.
    if disable_sliding_window and sliding_window_len is not None:
        derived_max_model_len = min(derived_max_model_len, sliding_window_len)

    # If none of the keys were found in the config, use a default and
    # log a warning.
    if derived_max_model_len == float("inf"):
        if max_model_len is not None:
            # If max_model_len is specified, we use it.
            return max_model_len

        default_max_len = 2048
        logger.warning(
            "The model's config.json does not contain any of the following "
            "keys to determine the original maximum length of the model: "
            f"{possible_keys}. Assuming the model's maximum length is "
            f"{default_max_len}.")
        derived_max_model_len = default_max_len

    rope_scaling = getattr(hf_config, "rope_scaling", None)
    if rope_scaling is not None:
        # The type may be stored under either "type" or "rope_type".
        rope_type = rope_scaling.get("type", rope_scaling.get("rope_type"))
        if rope_type not in {"su", "longrope", "llama3"}:
            if disable_sliding_window:
                # TODO: Find a model that supports rope_scaling
                # with sliding window to see if this case should be allowed.
                raise NotImplementedError(
                    "Disabling sliding window is not supported for models "
                    "with rope_scaling. Please raise an issue so we can "
                    "investigate.")
            assert "factor" in rope_scaling
            scaling_factor = rope_scaling["factor"]
            if rope_type == "yarn":
                # For yarn the factor applies to the original length.
                derived_max_model_len = rope_scaling[
                    "original_max_position_embeddings"]
            derived_max_model_len *= scaling_factor

    if max_model_len is None:
        max_model_len = derived_max_model_len
    elif max_model_len > derived_max_model_len:
        # The request exceeds what the config supports: install dynamic RoPE
        # scaling with the ratio as factor. This replaces any rope_scaling
        # already present on hf_config.
        scaling_factor = max_model_len / derived_max_model_len
        hf_config.rope_scaling = {"factor": scaling_factor, "type": "dynamic"}
        logger.warning(
            f"User-specified max_model_len {max_model_len} is higher than "
            f"the original {derived_max_model_len}. "
            "Attempting to use RoPE scaling.")
    return int(max_model_len)
@dataclass
class DecodingConfig:
    """Dataclass which contains the decoding strategy of the engine.

    Raises:
        ValueError: On construction, if guided_decoding_backend is not one
            of the supported backends.
    """

    # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
    guided_decoding_backend: str = 'outlines'

    def __post_init__(self):
        """Validate the selected guided decoding backend."""
        valid_guided_backends = ['outlines', 'lm-format-enforcer']
        backend = self.guided_decoding_backend
        if backend not in valid_guided_backends:
            # Close the quote around the backend name and separate the two
            # clauses, so the message reads correctly.
            raise ValueError(f"Invalid guided_decoding_backend '{backend}', "
                             f"must be one of {valid_guided_backends}")
@dataclass(frozen=True)
class EngineConfig:
    """Immutable bundle of every engine-related configuration object, so the
    individual configs can be passed around the codebase as one value.
    """

    model_config: ModelConfig
    cache_config: CacheConfig
    parallel_config: ParallelConfig
    scheduler_config: SchedulerConfig
    device_config: DeviceConfig
    load_config: LoadConfig
    lora_config: Optional[LoRAConfig]
    vision_language_config: Optional[VisionLanguageConfig]
    speculative_config: Optional[SpeculativeConfig]
    decoding_config: Optional[DecodingConfig]

    def __post_init__(self):
        """Cross-check the configs against each other after construction."""
        parallel = self.parallel_config
        self.model_config.verify_with_parallel_config(parallel)
        self.cache_config.verify_with_parallel_config(parallel)

        # LoRA checks only apply when a LoRA config was supplied.
        lora = self.lora_config
        if lora:
            lora.verify_with_model_config(self.model_config)
            lora.verify_with_scheduler_config(self.scheduler_config)

    def to_dict(self):
        """Return a mapping of field name to config object, suitable for
        passing on as **kwargs.
        """
        return {
            config_field.name: getattr(self, config_field.name)
            for config_field in fields(self)
        }
|