12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207 |
import enum
import json
import os
from dataclasses import dataclass, fields
from typing import TYPE_CHECKING, ClassVar, Optional, Union

import torch
from loguru import logger
from packaging.version import Version
from transformers import PretrainedConfig

from aphrodite.common.utils import (get_cpu_memory, get_nvcc_cuda_version,
                                    is_cpu, is_hip, is_neuron)
from aphrodite.transformers_utils.config import get_config, get_hf_text_config

if TYPE_CHECKING:
    # Only needed for annotations; keeps ray an optional runtime dependency.
    from ray.util.placement_group import PlacementGroup
# Number of bytes in one GiB (2**30); used to convert GiB sizes to bytes.
_GB = 1 << 30
class ModelConfig:
    """Configuration for the model.

    Args:
        model: Name or path of the huggingface model to use.
        tokenizer: Name or path of the huggingface tokenizer to use.
        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
            available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        download_dir: Directory to download and load the weights, default to
            the default cache directory of huggingface.
        load_format: The format of the model weights to load:
            "auto" will try to load the weights in the safetensors format and
                fall back to the pytorch bin format if safetensors format is
                not available.
            "pt" will load the weights in the pytorch bin format.
            "safetensors" will load the weights in the safetensors format.
            "npcache" will load the weights in pytorch format and store
                a numpy cache to speed up the loading.
            "dummy" will initialize the weights with random values, which is
                mainly for profiling.
        dtype: Data type for model weights and activations. The "auto" option
            will use FP16 precision for FP32 and FP16 models, and BF16
            precision for BF16 models.
        seed: Random seed for reproducibility.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id. If unspecified, will use the default
            version.
        code_revision: The specific revision to use for the model code on
            Hugging Face Hub. It can be a branch name, a tag name, or a
            commit id. If unspecified, will use the default version.
        tokenizer_revision: The specific tokenizer version to use. It can be
            a branch name, a tag name, or a commit id. If unspecified, will
            use the default version.
        max_model_len: Maximum length of a sequence (including prompt and
            output). If None, will be derived from the model.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
        load_in_4bit: Whether to load the FP16 model in bitsandbytes 4bit
            format. Works with AWQ models as well as FP16.
        load_in_8bit: Whether to load the FP16 model in 8bit format. Slower
            than load_in_smooth in terms of throughput.
        load_in_smooth: Whether to load the FP16 model in smoothquant format.
        quantization_param_path: Path to JSON file containing scaling
            factors. Used to load KV cache scaling factors into the model
            when KV cache type is FP8_E4M3 on ROCm (AMD GPU). In the future
            these will also be used to load activation and weight scaling
            factors when the model dtype is FP8_E4M3 on ROCm.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA
            graphs. When a sequence has context length larger than this, we
            fall back to eager mode.
        max_log_probs: Stored on the config as-is; presumably the maximum
            number of log probabilities a request may ask for (it is not
            consumed inside this class -- confirm against callers).
    """

    def __init__(
        self,
        model: str,
        tokenizer: str,
        tokenizer_mode: str,
        trust_remote_code: bool,
        download_dir: Optional[str],
        load_format: str,
        dtype: Union[str, torch.dtype],
        seed: int,
        revision: Optional[str] = None,
        code_revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        load_in_4bit: bool = False,
        load_in_8bit: bool = False,
        load_in_smooth: bool = False,
        quantization_param_path: Optional[str] = None,
        enforce_eager: bool = True,
        max_context_len_to_capture: Optional[int] = None,
        max_log_probs: int = 10,
    ) -> None:
        self.model = model
        self.tokenizer = tokenizer
        self.tokenizer_mode = tokenizer_mode
        self.trust_remote_code = trust_remote_code
        self.download_dir = download_dir
        self.load_format = load_format
        self.seed = seed
        self.revision = revision
        self.code_revision = code_revision
        self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
        self.load_in_4bit = load_in_4bit
        self.load_in_8bit = load_in_8bit
        self.load_in_smooth = load_in_smooth
        self.quantization_param_path = quantization_param_path
        self.enforce_eager = enforce_eager
        self.max_context_len_to_capture = max_context_len_to_capture
        self.max_log_probs = max_log_probs

        if os.environ.get("APHRODITE_USE_MODELSCOPE",
                          "False").lower() == "true":
            # download model from ModelScope hub,
            # lazy import so that modelscope is not required for normal use.
            from modelscope.hub.snapshot_download import snapshot_download  # pylint: disable=C
            if not os.path.exists(model):
                model_path = snapshot_download(model_id=model,
                                               cache_dir=download_dir,
                                               revision=revision)
            else:
                model_path = model
            # Model, tokenizer and download dir all point at the local
            # snapshot from here on.
            self.model = model_path
            self.download_dir = model_path
            self.tokenizer = model_path

        self.hf_config = get_config(self.model, trust_remote_code, revision,
                                    code_revision)
        self.hf_text_config = get_hf_text_config(self.hf_config)
        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
        self.max_model_len = _get_and_verify_max_len(self.hf_text_config,
                                                     max_model_len)
        self._verify_load_format()
        self._verify_tokenizer_mode()
        self._verify_quantization()
        self._verify_cuda_graph()

    def _verify_load_format(self) -> None:
        """Normalize ``load_format`` to lower case and validate it.

        Raises:
            ValueError: If the format is unknown, unsupported on ROCm, or is
                "pt" for a Mixtral architecture.
        """
        load_format = self.load_format.lower()
        supported_load_format = [
            "auto", "pt", "safetensors", "npcache", "dummy"
        ]
        rocm_not_supported_load_format = []
        if load_format not in supported_load_format:
            raise ValueError(
                f"Unknown load format: {self.load_format}. Must be one of "
                "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.")
        if is_hip() and load_format in rocm_not_supported_load_format:
            rocm_supported_load_format = [
                f for f in supported_load_format
                if (f not in rocm_not_supported_load_format)
            ]
            raise ValueError(
                f"load format '{load_format}' is not supported in ROCm. "
                f"Supported load format are "
                f"{rocm_supported_load_format}")

        # TODO: Remove this check once HF updates the pt weights of Mixtral.
        architectures = getattr(self.hf_config, "architectures", [])
        # architectures can be None instead of []
        if (architectures and "MixtralForCausalLM" in architectures
                and load_format == "pt"):
            raise ValueError(
                "Currently, the 'pt' format is not supported for Mixtral. "
                "Please use the 'safetensors' format instead. ")
        self.load_format = load_format

    def _verify_tokenizer_mode(self) -> None:
        """Normalize ``tokenizer_mode`` to lower case and validate it.

        Raises:
            ValueError: If the mode is neither "auto" nor "slow".
        """
        tokenizer_mode = self.tokenizer_mode.lower()
        if tokenizer_mode not in ["auto", "slow"]:
            raise ValueError(
                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
                "either 'auto' or 'slow'.")
        self.tokenizer_mode = tokenizer_mode

    @staticmethod
    def _bnb_quant_config(bits: int, quant_mode: str) -> dict:
        """Build the quantization config dict used for on-the-fly bnb loads."""
        return {
            "bits": bits,
            "quant_mode": quant_mode,
            "quant_method": "bnb",
            "group_size": 128,
            "zero_point": True,
            "from_float": True,
        }

    def _verify_quantization(self) -> None:
        """Resolve and validate the quantization method.

        The method is taken, in order, from the ``quantization`` argument, a
        ``gguf`` model file suffix, the HF ``quantization_config`` and the
        ``load_in_*`` flags. Conflicting sources raise an error. The
        ``load_in_*`` flags also overwrite ``hf_config.quantization_config``,
        and the 8bit/smooth variants force eager mode.

        Raises:
            ValueError: On conflicting or unsupported quantization settings.
        """
        supported_quantization = [
            "aqlm", "awq", "bnb", "eetq", "exl2", "gguf", "gptq", "quip",
            "squeezellm", "marlin"
        ]
        rocm_not_supported_quantization = ["aqlm", "awq", "bnb", "quip"]
        if self.quantization is not None:
            self.quantization = self.quantization.lower()

        if self.model.endswith("gguf"):
            if self.quantization is None:
                self.quantization = "gguf"
            elif self.quantization != "gguf":
                raise ValueError(
                    f"GGUF file cannot be used in ({self.quantization}).")

        # Parse quantization method from the HF model config, if available.
        hf_quant_config = getattr(self.hf_config, "quantization_config", None)
        if hf_quant_config is not None:
            hf_quant_method = str(hf_quant_config["quant_method"]).lower()
            # If the GPTQ model is serialized in marlin format, use marlin.
            if (hf_quant_method == "gptq"
                    and "is_marlin_format" in hf_quant_config
                    and hf_quant_config["is_marlin_format"]):
                hf_quant_method = "marlin"
            if self.quantization is None:
                self.quantization = hf_quant_method
            elif self.quantization != hf_quant_method:
                raise ValueError(
                    "Quantization method specified in the model config "
                    f"({hf_quant_method}) does not match the quantization "
                    f"method specified in the `quantization` argument "
                    f"({self.quantization}).")

        if self.load_in_4bit:
            # the kernels seem to not work with 4bit weight_only
            if torch.cuda.get_device_capability(0)[0] < 8:
                raise ValueError(
                    "load_in_4bit quantization is not supported on GPUs with "
                    "compute capability less than 8.0.")
            if self.quantization is None:
                self.quantization = "bnb"
                self.hf_config.quantization_config = self._bnb_quant_config(
                    4, "weight_only")
            elif self.quantization == "awq":
                logger.warning("AWQ model is being loaded in 4bit bnb format.")
                self.quantization = "bnb"
                self.hf_config.quantization_config = {
                    "zero_point": True,
                    "q_group_size": 128,
                    "w_bit": 4,
                    "version": "gemm"
                }
            elif self.quantization != "bnb":
                raise ValueError("4bit quantization is not supported in "
                                 f"{self.quantization}.")

        if self.load_in_8bit:
            if self.quantization is None:
                self.quantization = "bnb"
            elif self.quantization != "bnb":
                raise ValueError("8bit quantization is not supported in "
                                 f"{self.quantization}.")
            self.hf_config.quantization_config = self._bnb_quant_config(
                8, "llm_int8")
            self.enforce_eager = True

        if self.load_in_smooth:
            if self.quantization is None:
                self.quantization = "bnb"
            elif self.quantization != "bnb":
                raise ValueError("Smooth quantization is not supported in "
                                 f"{self.quantization}.")
            self.hf_config.quantization_config = self._bnb_quant_config(
                8, "smoothquant")
            self.enforce_eager = True

        if self.quantization is not None:
            if self.quantization not in supported_quantization:
                raise ValueError(
                    f"Unknown quantization method: {self.quantization}. Must "
                    f"be one of {supported_quantization}.")
            if (is_hip()
                    and self.quantization in rocm_not_supported_quantization):
                raise ValueError(
                    f"{self.quantization} quantization is currently not "
                    "supported in ROCm.")
            if self.quantization != "marlin":
                logger.warning(
                    f"{self.quantization} quantization is not fully "
                    "optimized yet. The speed can be slower than "
                    "non-quantized models.")

    def _verify_cuda_graph(self) -> None:
        """Default and clamp ``max_context_len_to_capture`` to the model len."""
        if self.max_context_len_to_capture is None:
            self.max_context_len_to_capture = self.max_model_len
        self.max_context_len_to_capture = min(self.max_context_len_to_capture,
                                              self.max_model_len)

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        """Check that heads and layers divide evenly across parallel ranks.

        Raises:
            ValueError: If the attention head count is not divisible by the
                tensor parallel size, or the hidden layer count is not
                divisible by the pipeline parallel size.
        """
        total_num_attention_heads = self.hf_text_config.num_attention_heads
        tensor_parallel_size = parallel_config.tensor_parallel_size
        if total_num_attention_heads % tensor_parallel_size != 0:
            raise ValueError(
                f"Total number of attention heads ({total_num_attention_heads})"
                " must be divisible by tensor parallel size "
                f"({tensor_parallel_size}).")

        total_num_hidden_layers = self.hf_text_config.num_hidden_layers
        pipeline_parallel_size = parallel_config.pipeline_parallel_size
        if total_num_hidden_layers % pipeline_parallel_size != 0:
            raise ValueError(
                f"Total number of hidden layers ({total_num_hidden_layers}) "
                "must be divisible by pipeline parallel size "
                f"({pipeline_parallel_size}).")

    def get_sliding_window(self) -> Optional[int]:
        """Return the sliding window size, or None if absent or disabled."""
        if (hasattr(self.hf_text_config, "use_sliding_window")
                and not self.hf_text_config.use_sliding_window):
            return None
        return getattr(self.hf_text_config, "sliding_window", None)

    def get_vocab_size(self) -> int:
        """Return the vocabulary size from the HF text config."""
        return self.hf_text_config.vocab_size

    def get_hidden_size(self) -> int:
        """Return the hidden size from the HF text config."""
        return self.hf_text_config.hidden_size

    def get_head_size(self) -> int:
        """Return the per-head dimension.

        Uses ``hf_config.head_dim`` when present, otherwise
        ``hidden_size // num_attention_heads``.
        """
        if hasattr(self.hf_config, "head_dim"):
            return self.hf_config.head_dim
        # FIXME: This may not be true for all models.
        return (self.hf_text_config.hidden_size //
                self.hf_text_config.num_attention_heads)

    def get_total_num_kv_heads(self) -> int:
        """Returns the total number of KV heads."""
        # For GPTBigCode & Falcon:
        # NOTE: for falcon, when new_decoder_architecture is True, the
        # multi_query flag is ignored and we use n_head_kv for the number of
        # KV heads.
        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
        new_decoder_arch_falcon = (
            self.hf_config.model_type in falcon_model_types
            and getattr(self.hf_config, "new_decoder_architecture", False))
        if not new_decoder_arch_falcon and getattr(self.hf_text_config,
                                                   "multi_query", False):
            # Multi-query attention, only one KV head.
            # Currently, tensor parallelism is not supported in this case.
            return 1

        # For DBRX and MPT
        if self.hf_config.model_type in ["dbrx", "mpt"]:
            return getattr(self.hf_config.attn_config, "kv_n_heads",
                           self.hf_config.num_attention_heads)

        attributes = [
            # For Falcon:
            "n_head_kv",
            "num_kv_heads",
            # For LLaMA-2:
            "num_key_value_heads",
            # For ChatGLM:
            "multi_query_group_num",
        ]
        for attr in attributes:
            num_kv_heads = getattr(self.hf_text_config, attr, None)
            if num_kv_heads is not None:
                return num_kv_heads

        # For non-grouped-query attention models, the number of KV heads is
        # equal to the number of attention heads.
        return self.hf_text_config.num_attention_heads

    def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
        """Returns the number of KV heads per GPU."""
        total_num_kv_heads = self.get_total_num_kv_heads()
        # If tensor parallelism is used, we divide the number of KV heads by
        # the tensor parallel size. We will replicate the KV heads in the
        # case where the number of KV heads is smaller than the tensor
        # parallel size so each GPU has at least one KV head.
        return max(1,
                   total_num_kv_heads // parallel_config.tensor_parallel_size)

    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
        """Return the number of hidden layers per pipeline parallel rank."""
        total_num_hidden_layers = self.hf_text_config.num_hidden_layers
        return total_num_hidden_layers // parallel_config.pipeline_parallel_size
class CacheConfig:
    """Configuration for the KV cache.

    Args:
        block_size: Size of a cache block in number of tokens.
        gpu_memory_utilization: Fraction of GPU memory to use for the
            Aphrodite execution.
        swap_space: Size of the CPU swap space per GPU (in GiB).
        cache_dtype: Data Type for KV cache storage ("auto" or "fp8").
        num_gpu_blocks_override: Number of GPU blocks to use. This overrides
            the profiled num_gpu_blocks if specified. Does nothing if None.
        sliding_window: Sliding window size, stored as-is on the config.
        context_shift: Flag stored as-is on the config; its semantics are
            defined by the consumers of this config.
    """

    def __init__(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        swap_space: int,
        cache_dtype: str,
        num_gpu_blocks_override: Optional[int] = None,
        sliding_window: Optional[int] = None,
        context_shift: bool = False,
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
        # Stored in bytes; the argument is given in GiB.
        self.swap_space_bytes = swap_space * _GB
        self.num_gpu_blocks_override = num_gpu_blocks_override
        self.cache_dtype = cache_dtype
        self.sliding_window = sliding_window
        self.context_shift = context_shift
        self._verify_args()
        self._verify_cache_dtype()

        # Will be set after profiling.
        self.num_gpu_blocks = None
        self.num_cpu_blocks = None

    def metrics_info(self):
        """Return the config as a ``{str: str}`` dict for prometheus info."""
        # convert cache_config to dict(key: str, value: str) for prometheus
        # metrics info
        return {key: str(value) for key, value in self.__dict__.items()}

    def _verify_args(self) -> None:
        """Reject a GPU memory utilization above 1.0.

        Raises:
            ValueError: If ``gpu_memory_utilization`` is greater than 1.0.
        """
        if self.gpu_memory_utilization > 1.0:
            raise ValueError(
                "GPU memory utilization must be less than 1.0. Got "
                f"{self.gpu_memory_utilization}.")

    def _verify_cache_dtype(self) -> None:
        """Validate ``cache_dtype`` ("auto" or "fp8").

        Raises:
            ValueError: If the dtype is unknown, or if "fp8" is requested on
                a non-ROCm build whose nvcc CUDA version is below 11.8.
        """
        if self.cache_dtype == "auto":
            pass
        elif self.cache_dtype == "fp8":
            if not is_hip():
                nvcc_cuda_version = get_nvcc_cuda_version()
                if nvcc_cuda_version and nvcc_cuda_version < Version("11.8"):
                    raise ValueError(
                        "FP8 is not supported when cuda version is "
                        "lower than 11.8.")
            logger.info(
                "Using fp8 data type to store kv cache. It reduces the GPU "
                "memory footprint and boosts the performance. "
                "But it may cause slight accuracy drop without scaling "
                "factors. FP8_E5M2 (without scaling) is only supported on "
                "cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 "
                "is instead supported for common inference criteria.")
        else:
            raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        """Check that the total swap space fits in CPU memory.

        Logs a warning above 40% of total CPU memory.

        Raises:
            ValueError: If the swap space exceeds 70% of total CPU memory.
        """
        total_cpu_memory = get_cpu_memory()
        # FIXME: Here, it is assumed that the GPUs in a tensor parallel
        # group are in the same node. However, the GPUs may span multiple nodes.
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

        msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
               f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
               "allocated for the swap space.")
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
            logger.warning("Possibly too large swap space. " + msg)
@dataclass
class TokenizerPoolConfig:
    """Configuration for the tokenizer pool.

    Args:
        pool_size: Number of tokenizer instances in the pool.
        pool_type: Type of the tokenizer pool.
        extra_config: Additional config for the pool.
            The way the config will be used depends on the
            pool type.
    """
    pool_size: int
    pool_type: str
    extra_config: dict

    def __post_init__(self):
        """Validate the pool type and the type of ``extra_config``.

        Raises:
            ValueError: If ``pool_type`` is not "ray" or ``extra_config`` is
                not a dict.
        """
        if self.pool_type not in ("ray", ):
            raise ValueError(f"Unknown pool type: {self.pool_type}.")
        if not isinstance(self.extra_config, dict):
            raise ValueError("extra_config must be a dictionary.")

    @classmethod
    def create_config(
        cls, tokenizer_pool_size: int, tokenizer_pool_type: str,
        tokenizer_pool_extra_config: Optional[Union[str, dict]]
    ) -> Optional["TokenizerPoolConfig"]:
        """Create a TokenizerPoolConfig from the given parameters.

        If tokenizer_pool_size is 0, return None.

        Args:
            tokenizer_pool_size: Number of tokenizer workers in the pool.
            tokenizer_pool_type: Type of the tokenizer pool.
            tokenizer_pool_extra_config: Additional config for the pool.
                The way the config will be used depends on the pool type.
                A string is parsed as JSON; None becomes an empty dict.
        """
        if not tokenizer_pool_size:
            # A pool size of 0 means no pool is configured.
            return None

        if isinstance(tokenizer_pool_extra_config, str):
            extra_config = json.loads(tokenizer_pool_extra_config)
        else:
            extra_config = tokenizer_pool_extra_config or {}
        return cls(tokenizer_pool_size, tokenizer_pool_type, extra_config)
class ParallelConfig:
    """Configuration for the distributed execution.

    Args:
        pipeline_parallel_size: Number of pipeline parallel groups.
        tensor_parallel_size: Number of tensor parallel groups.
        worker_use_ray: Whether to use Ray for model workers. Will be set to
            True if either pipeline_parallel_size or tensor_parallel_size is
            greater than 1.
        max_parallel_loading_workers: Maximum number of multiple batches
            when load model sequentially. To avoid RAM OOM when using tensor
            parallel and large models.
        disable_custom_all_reduce: Disable the custom all-reduce kernel and
            fall back to NCCL.
        tokenizer_pool_config: Configuration for the tokenizer pool.
            If None, will use synchronous tokenization.
        ray_workers_use_nsight: Whether to profile Ray workers with nsight, see
            https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
        placement_group: Optional Ray placement group, stored as-is on the
            config.
    """

    def __init__(
        self,
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        worker_use_ray: bool,
        max_parallel_loading_workers: Optional[int] = None,
        disable_custom_all_reduce: bool = False,
        tokenizer_pool_config: Optional["TokenizerPoolConfig"] = None,
        ray_workers_use_nsight: bool = False,
        placement_group: Optional["PlacementGroup"] = None,
    ) -> None:
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.worker_use_ray = worker_use_ray
        self.max_parallel_loading_workers = max_parallel_loading_workers
        self.disable_custom_all_reduce = disable_custom_all_reduce
        self.tokenizer_pool_config = tokenizer_pool_config
        self.ray_workers_use_nsight = ray_workers_use_nsight
        self.placement_group = placement_group

        self.world_size = pipeline_parallel_size * self.tensor_parallel_size
        # Any multi-worker setup is forced onto Ray.
        if self.world_size > 1:
            self.worker_use_ray = True
        self._verify_args()

    def _verify_args(self) -> None:
        """Validate the parallel settings and adjust custom all-reduce.

        Raises:
            NotImplementedError: If ``pipeline_parallel_size`` is above 1.
            ValueError: If nsight profiling is requested without Ray workers.
        """
        if self.pipeline_parallel_size > 1:
            raise NotImplementedError(
                "Pipeline parallelism is not supported yet.")
        if not self.disable_custom_all_reduce and self.world_size > 1:
            if is_hip():
                self.disable_custom_all_reduce = True
                logger.info(
                    "Disabled the custom all-reduce kernel because it is not "
                    "supported on AMD GPUs.")
            elif self.pipeline_parallel_size > 1:
                # NOTE: currently unreachable because pipeline parallelism
                # is rejected above; kept for when that check is lifted.
                self.disable_custom_all_reduce = True
                logger.info(
                    "Disabled the custom all-reduce kernel because it is not "
                    "supported with pipeline parallelism.")
        if self.ray_workers_use_nsight and not self.worker_use_ray:
            raise ValueError("Unable to use nsight profiling unless workers "
                             "run with Ray.")
class SchedulerConfig:
    """Scheduler configuration.

    Args:
        max_num_batched_tokens: Maximum number of tokens to be processed in
            a single iteration. If None, defaults to 768 with chunked
            prefill, otherwise to ``max(max_model_len, 2048)``.
        max_num_seqs: Maximum number of sequences to be processed in a single
            iteration.
        max_model_len: Maximum length of a sequence (including prompt
            and generated text).
        use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not.
        num_lookahead_slots: The number of slots to allocate per sequence per
            step, beyond the known token ids. This is used in speculative
            decoding to store KV activations of tokens which may or may not be
            accepted.
        delay_factor: Apply a delay (of delay factor multiplied by previous
            prompt latency) before scheduling the next prompt.
        policy: Policy of sequence scheduling (`fcfs` or `reorder`).
        reorder_window: Allowed reorder window size (in sec) for `reorder`
            policy.
        enable_chunked_prefill: If True, prefill requests can be chunked
            based on the remaining max_num_batched_tokens.
    """

    def __init__(
        self,
        max_num_batched_tokens: Optional[int],
        max_num_seqs: int,
        max_model_len: int,
        use_v2_block_manager: bool = False,
        num_lookahead_slots: int = 0,
        delay_factor: float = 0.0,
        policy: str = "fcfs",
        reorder_window: float = 0.0,
        enable_chunked_prefill: bool = False,
    ) -> None:
        if max_num_batched_tokens is not None:
            self.max_num_batched_tokens = max_num_batched_tokens
        elif enable_chunked_prefill:
            # For chunked prefill, choose the well-tuned batch size.
            self.max_num_batched_tokens = 768
        else:
            # If max_model_len is too short, use 2048 as the default value
            # for higher throughput.
            self.max_num_batched_tokens = max(max_model_len, 2048)
        if enable_chunked_prefill:
            logger.info("Chunked prefill is enabled (EXPERIMENTAL).")

        self.max_num_seqs = max_num_seqs
        self.max_model_len = max_model_len
        self.use_v2_block_manager = use_v2_block_manager
        self.num_lookahead_slots = num_lookahead_slots
        self.delay_factor = delay_factor
        self.policy = policy
        self.reorder_window = reorder_window
        self.chunked_prefill_enabled = enable_chunked_prefill
        self._verify_args()

    def _verify_args(self) -> None:
        """Validate the scheduler limits against each other.

        Raises:
            ValueError: If the token budget is below ``max_model_len``
                (without chunked prefill) or below ``max_num_seqs``, if
                ``reorder_window`` is negative or used with a policy other
                than `reorder`, or if ``num_lookahead_slots`` is negative.
        """
        if (self.max_num_batched_tokens < self.max_model_len
                and not self.chunked_prefill_enabled):
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({self.max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes Aphrodite reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len.")
        if self.max_num_batched_tokens < self.max_num_seqs:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                "be greater than or equal to max_num_seqs "
                f"({self.max_num_seqs}).")
        if self.reorder_window < 0:
            raise ValueError(f"reorder_window ({self.reorder_window}) must "
                             "not be negative.")
        if self.reorder_window != 0 and self.policy != 'reorder':
            raise ValueError("fcfs policy doesn't support reorder_window "
                             f"({self.reorder_window}).")
        if self.num_lookahead_slots < 0:
            raise ValueError(
                "num_lookahead_slots "
                f"({self.num_lookahead_slots}) must be greater than or "
                "equal to 0.")
class DeviceConfig:
    """Configuration for the execution device.

    Args:
        device: Device type to use. "auto" picks the first detected one in
            the order cuda, neuron, cpu; any other value is used verbatim as
            the device type.

    Attributes set:
        device_type: The resolved device type string.
        device: The ``torch.device`` inputs are placed on. This is the CPU
            device when ``device_type`` is "neuron".
    """

    def __init__(self, device: str = "auto") -> None:
        if device == "auto":
            # Automated device type detection
            if torch.cuda.is_available():
                self.device_type = "cuda"
            elif is_neuron():
                self.device_type = "neuron"
            elif is_cpu():
                self.device_type = "cpu"
            else:
                raise RuntimeError("No supported device detected.")
        else:
            # Device type is assigned explicitly
            self.device_type = device

        # Some device types require processing inputs on CPU
        if self.device_type in ["neuron"]:
            self.device = torch.device("cpu")
        else:
            # Set device with device type
            self.device = torch.device(self.device_type)
- class SpeculativeConfig:
- """Configuration for speculative decoding.
- The configuration is currently specialized to draft-model speculative
- decoding with top-1 proposals.
- """
- @staticmethod
- def maybe_create_spec_config(
- target_model_config: ModelConfig,
- target_parallel_config: ParallelConfig,
- target_dtype: str,
- speculative_model: Optional[str],
- num_speculative_tokens: Optional[int],
- speculative_max_model_len: Optional[int],
- enable_chunked_prefill: bool,
- use_v2_block_manager: bool,
- ngram_prompt_lookup_max: Optional[int],
- ngram_prompt_lookup_min: Optional[int],
- ) -> Optional["SpeculativeConfig"]:
- """Create a SpeculativeConfig if possible, else return None.
- This function attempts to create a SpeculativeConfig object based on the
- provided parameters. If the necessary conditions are met, it returns an
- instance of SpeculativeConfig. Otherwise, it returns None.
- Args:
- target_model_config (ModelConfig): The configuration of the target
- model.
- target_parallel_config (ParallelConfig): The parallel configuration
- for the target model.
- target_dtype (str): The data type used for the target model.
- speculative_model (Optional[str]): The name of the speculative
- model, if provided.
- num_speculative_tokens (Optional[int]): The number of speculative
- tokens, if provided.
- speculative_max_model_len (Optional[int]): The maximum model len of
- the speculative model. Used when testing the ability to skip
- speculation for some sequences.
- enable_chunked_prefill (bool): Whether Aphrodite is configured to
- use chunked prefill or not. Used for raising an error since its
- not yet compatible with spec decode.
- use_v2_block_manager (bool): Whether Aphrodite is configured to
- use the v2 block manager or not. Used for raising an error
- since the v2 block manager is required with spec decode.
- ngram_prompt_lookup_max (Optional[int]): Max size of ngram token
- window, if provided.
- ngram_prompt_lookup_min (Optional[int]): Min size of ngram token
- window, if provided.
- Returns:
- Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
- the necessary conditions are met, else None.
- """
- if (speculative_model is None and num_speculative_tokens is None):
- return None
- if speculative_model is not None and num_speculative_tokens is None:
- raise ValueError(
- "Expected both speculative_model and "
- "num_speculative_tokens to be provided, but found "
- f"{speculative_model=} and {num_speculative_tokens=}.")
- assert (speculative_model is not None
- and num_speculative_tokens is not None)
- if enable_chunked_prefill:
- raise ValueError(
- "Speculative decoding and chunked prefill are "
- f"currently mutually exclusive ({enable_chunked_prefill=}).")
- if not use_v2_block_manager:
- raise ValueError(
- "Speculative decoding requires usage of the V2 "
- "block manager. Enable it with --use-v2-block-manager.")
- # TODO: The user should be able to specify revision/quantization/max
- # model len for the draft model. It is not currently supported.
- draft_revision = None
- draft_code_revision = None
- draft_quantization = None
- if speculative_model == "[ngram]":
- assert (ngram_prompt_lookup_max is not None
- and ngram_prompt_lookup_max > 0)
- if ngram_prompt_lookup_min is None:
- ngram_prompt_lookup_min = 0
- else:
- assert ngram_prompt_lookup_max > ngram_prompt_lookup_min
- draft_model_config = target_model_config
- draft_parallel_config = target_parallel_config
- else:
- ngram_prompt_lookup_max = 0
- ngram_prompt_lookup_min = 0
- draft_model_config = ModelConfig(
- model=speculative_model,
- download_dir=target_model_config.download_dir,
- load_format=target_model_config.load_format,
- tokenizer=target_model_config.tokenizer,
- tokenizer_mode=target_model_config.tokenizer_mode,
- trust_remote_code=target_model_config.trust_remote_code,
- dtype=target_model_config.dtype,
- seed=target_model_config.seed,
- revision=draft_revision,
- code_revision=draft_code_revision,
- tokenizer_revision=target_model_config.tokenizer_revision,
- max_model_len=None,
- quantization=draft_quantization,
- enforce_eager=target_model_config.enforce_eager,
- max_context_len_to_capture=target_model_config.
- max_context_len_to_capture,
- max_log_probs=target_model_config.max_log_probs,
- )
- draft_model_config.max_model_len = (
- SpeculativeConfig._maybe_override_draft_max_model_len(
- speculative_max_model_len,
- draft_model_config.max_model_len,
- target_model_config.max_model_len,
- ))
- draft_parallel_config = (
- SpeculativeConfig.create_draft_parallel_config(
- target_parallel_config))
- return SpeculativeConfig(
- draft_model_config,
- draft_parallel_config,
- num_speculative_tokens,
- ngram_prompt_lookup_max,
- ngram_prompt_lookup_min,
- )
@staticmethod
def _maybe_override_draft_max_model_len(
    speculative_max_model_len: Optional[int],
    draft_max_model_len: int,
    target_max_model_len: int,
) -> int:
    """Pick the maximum sequence length to use for the draft model.

    When no explicit override is given, the result is the smaller of the
    draft and target limits, so that sequences never exceed the capacity
    of either model. When ``speculative_max_model_len`` is given it is
    returned as-is, provided it fits within both limits. The override is
    mainly used for testing that sequences can skip speculation.

    Args:
        speculative_max_model_len: Optional explicit override.
        draft_max_model_len: Max length supported by the draft model.
        target_max_model_len: Max length supported by the target model.

    Returns:
        The max model length the draft model should use.

    Raises:
        ValueError: If the override is larger than the draft limit or
            larger than the target limit (checked in that order).
    """
    # Common case first: no override, so clamp to the tighter limit.
    if speculative_max_model_len is None:
        return min(draft_max_model_len, target_max_model_len)

    if speculative_max_model_len > draft_max_model_len:
        raise ValueError(f"{speculative_max_model_len=} cannot be "
                         f"larger than {draft_max_model_len=}")

    if speculative_max_model_len > target_max_model_len:
        raise ValueError(f"{speculative_max_model_len=} cannot be "
                         f"larger than {target_max_model_len=}")

    return speculative_max_model_len
@staticmethod
def create_draft_parallel_config(
        target_parallel_config: ParallelConfig) -> ParallelConfig:
    """Build the parallel config used by the draft worker.

    The draft config is a field-by-field copy of the listed settings from
    the target parallel config. In the future the draft worker can have a
    different parallel strategy, e.g. TP=1.

    Args:
        target_parallel_config: Parallel config of the target model.

    Returns:
        A new ParallelConfig constructed from the target's values.
    """
    # Settings forwarded unchanged from the target config, in the order
    # they are read from it.
    mirrored_fields = [
        "pipeline_parallel_size",
        "tensor_parallel_size",
        "worker_use_ray",
        "max_parallel_loading_workers",
        "disable_custom_all_reduce",
        "tokenizer_pool_config",
        "ray_workers_use_nsight",
        "placement_group",
    ]
    draft_kwargs = {
        field_name: getattr(target_parallel_config, field_name)
        for field_name in mirrored_fields
    }
    return ParallelConfig(**draft_kwargs)
def __init__(
    self,
    draft_model_config: ModelConfig,
    draft_parallel_config: ParallelConfig,
    num_speculative_tokens: int,
    ngram_prompt_lookup_max: int,
    ngram_prompt_lookup_min: int,
):
    """Create a SpeculativeConfig object.

    All arguments are stored on the instance unchanged, after which the
    configuration is validated via ``_verify_args``.

    Args:
        draft_model_config: ModelConfig for the draft model.
        draft_parallel_config: ParallelConfig for the draft model.
        num_speculative_tokens: The number of tokens to sample from the
            draft model before scoring with the target model.
        ngram_prompt_lookup_max: Upper ngram window value; ``__repr__``
            treats a value greater than zero as the "[ngram]" draft.
        ngram_prompt_lookup_min: Lower ngram window value.
    """
    # Draft model / parallelism settings.
    self.draft_model_config = draft_model_config
    self.draft_parallel_config = draft_parallel_config

    # Speculation length.
    self.num_speculative_tokens = num_speculative_tokens

    # Ngram lookup window.
    self.ngram_prompt_lookup_max = ngram_prompt_lookup_max
    self.ngram_prompt_lookup_min = ngram_prompt_lookup_min

    # Validate only once every attribute is in place.
    self._verify_args()
def _verify_args(self) -> None:
    """Validate the stored speculative-decoding settings.

    Raises:
        ValueError: If ``num_speculative_tokens`` is not greater than zero.
    """
    token_count = self.num_speculative_tokens
    if token_count <= 0:
        raise ValueError("Expected num_speculative_tokens to be greater "
                         f"than zero ({token_count}).")

    draft_config = self.draft_model_config
    if not draft_config:
        # Nothing further to cross-check without a draft model config.
        return
    draft_config.verify_with_parallel_config(self.draft_parallel_config)
@property
def num_lookahead_slots(self) -> int:
    """Extra slots the scheduler should allocate per step.

    These are in addition to the slots allocated for each known token.
    The value equals the number of speculative tokens, as each
    speculative token must be scored.
    """
    lookahead = self.num_speculative_tokens
    return lookahead
def __repr__(self) -> str:
    """Return a short summary: the draft model and speculative token count.

    A positive ``ngram_prompt_lookup_max`` is reported as the "[ngram]"
    draft; otherwise the draft model config's ``model`` is shown.
    """
    # NOTE: the local names below appear verbatim in the output because of
    # the ``{name=}`` f-string form, so they must not be renamed.
    draft_model = ("[ngram]" if self.ngram_prompt_lookup_max > 0 else
                   self.draft_model_config.model)
    num_spec_tokens = self.num_speculative_tokens
    return f"SpeculativeConfig({draft_model=}, {num_spec_tokens=})"
@dataclass
class LoRAConfig:
    """LoRA settings, validated on construction and against other configs."""

    max_lora_rank: int
    max_loras: int
    # Defaults to max_loras when left as None (see __post_init__).
    max_cpu_loras: Optional[int] = None
    # None / "auto" are resolved to the model dtype in
    # verify_with_model_config; other strings are looked up on torch.
    lora_dtype: Optional[torch.dtype] = None
    lora_extra_vocab_size: int = 256
    # This is a constant.
    lora_vocab_padding_size: ClassVar[int] = 256

    def __post_init__(self):
        """Validate rank, extra vocab size and LoRA counts.

        Raises:
            ValueError: If any value is outside its supported set/range.
        """
        # Keep this in sync with kernels/punica/bgmv/bgmv_config.h
        allowed_ranks = (8, 16, 32, 64)
        allowed_extra_vocab = (0, 256, 512)

        if self.max_lora_rank not in allowed_ranks:
            raise ValueError(
                f"max_lora_rank ({self.max_lora_rank}) must be one of "
                f"{allowed_ranks}.")

        if self.lora_extra_vocab_size not in allowed_extra_vocab:
            raise ValueError(
                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
                f"must be one of {allowed_extra_vocab}.")

        if self.max_loras < 1:
            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")

        if self.max_cpu_loras is None:
            # No explicit CPU budget: mirror the max_loras value.
            self.max_cpu_loras = self.max_loras
            return
        if self.max_cpu_loras < self.max_loras:
            raise ValueError(
                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                f"max_loras ({self.max_loras})")

    def verify_with_model_config(self, model_config: ModelConfig):
        """Resolve ``lora_dtype`` and reject GGUF-quantized models.

        Raises:
            ValueError: If the model's quantization is "gguf".
        """
        if self.lora_dtype in (None, "auto"):
            self.lora_dtype = model_config.dtype
        elif isinstance(self.lora_dtype, str):
            # A dtype given by name is resolved as an attribute of torch.
            self.lora_dtype = getattr(torch, self.lora_dtype)

        quantization = model_config.quantization
        if quantization is not None and quantization == "gguf":
            raise ValueError("LoRA is not supported with GGUF quantization.")

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
        """Check the scheduler's batched-token limit against the LoRA kernel.

        Raises:
            ValueError: If ``max_num_batched_tokens`` exceeds 65528.
        """
        batched_tokens = scheduler_config.max_num_batched_tokens
        if batched_tokens > 65528:
            raise ValueError(
                "Due to limitations of the custom LoRA CUDA kernel, "
                "max_num_batched_tokens must be <= 65528 when "
                "LoRA is enabled.")
@dataclass
class VisionLanguageConfig:
    """Configs the input data format and how models should run for
    vision language models."""

    class ImageInputType(enum.Enum):
        """Image input type into the vision language model.

        An image roughly goes through the following transformation:
        Raw image --> pixel values --> image features --> image embeddings.

        The difference between different image input types is where the
        image encoder (pixel values --> image features) is run.
        Different image input types also correspond to different tensor
        shapes.

        For example, for Llava, PIXEL_VALUES: (1, 3, 336, 336).
        IMAGE_FEATURES: (1, 576, 1024).
        """
        PIXEL_VALUES = enum.auto()
        IMAGE_FEATURES = enum.auto()

    image_input_type: ImageInputType
    # The input id corresponding to image token.
    image_token_id: int
    # Used for running `run_prefill_max_token`.
    # For models that support varying resolution, this corresponds to
    # worst case scenario (biggest supported resolution).
    image_input_shape: tuple
    image_feature_size: int

    @classmethod
    def get_image_input_enum_type(
            cls, value: str) -> "VisionLanguageConfig.ImageInputType":
        """Look up an ImageInputType member by its (case-insensitive) name.

        Args:
            value: Member name, e.g. "pixel_values".

        Returns:
            The matching ImageInputType member.

        Raises:
            ValueError: If no member has that name; chained from the
                underlying KeyError.
        """
        member_name = value.upper()
        try:
            member = cls.ImageInputType[member_name]
        except KeyError as err:
            valid_names = [x.name for x in cls.ImageInputType]
            raise ValueError(f"{value} is not a valid choice. "
                             f"Expecting to choose from "
                             f"{valid_names}.") from err
        return member
# Accepted dtype names and the torch dtype each one selects. Insertion
# order matters: it is the order in which supported names are listed in
# the ROCm error message built by _get_and_verify_dtype.
_STR_DTYPE_TO_TORCH_DTYPE = dict(
    half=torch.float16,
    float16=torch.float16,
    float=torch.float32,
    float32=torch.float32,
    bfloat16=torch.bfloat16,
)

# dtype names excluded from the "supported dtypes" list reported on ROCm.
_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]
def _get_and_verify_dtype(
    config: PretrainedConfig,
    dtype: Union[str, torch.dtype],
) -> torch.dtype:
    """Resolve the requested dtype against the model config's dtype.

    Args:
        config: Model config; its ``torch_dtype`` attribute (float32 when
            missing or None) is the reference dtype.
        dtype: Either a torch dtype, or a case-insensitive name: "auto"
            or a key of ``_STR_DTYPE_TO_TORCH_DTYPE``.

    Returns:
        The torch dtype to use.

    Raises:
        ValueError: If ``dtype`` is an unknown name or an unsupported type,
            or if the result is float32 while ``is_hip()`` is true.
    """
    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
    # because config.torch_dtype can be None.
    config_dtype = getattr(config, "torch_dtype", None)
    if config_dtype is None:
        config_dtype = torch.float32

    if isinstance(dtype, torch.dtype):
        torch_dtype = dtype
    elif isinstance(dtype, str):
        dtype = dtype.lower()
        if dtype != "auto":
            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
                raise ValueError(f"Unknown dtype: {dtype}")
            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
        elif config_dtype == torch.float32:
            # Following the common practice, we use float16 for float32
            # models.
            torch_dtype = torch.float16
        else:
            torch_dtype = config_dtype
    else:
        raise ValueError(f"Unknown dtype: {dtype}")

    if is_hip() and torch_dtype == torch.float32:
        rocm_supported_dtypes = [
            name for name in _STR_DTYPE_TO_TORCH_DTYPE
            if name not in _ROCM_NOT_SUPPORTED_DTYPE
        ]
        raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. "
                         f"Supported dtypes are {rocm_supported_dtypes}")

    # Verify the dtype. Upcasting to float32 and downcasting from float32
    # are both allowed silently; only a cast between the half-precision
    # types (float16 <-> bfloat16) is reported with a warning.
    involves_float32 = (torch_dtype == torch.float32
                        or config_dtype == torch.float32)
    if torch_dtype != config_dtype and not involves_float32:
        logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

    return torch_dtype
def _get_and_verify_max_len(
    hf_config: PretrainedConfig,
    max_model_len: Optional[int],
) -> int:
    """Get and verify the model's maximum length.

    The length is derived from the first attribute in ``possible_keys``
    that is set on ``hf_config`` (2048 when none is set and no explicit
    ``max_model_len`` was given), then adjusted by ``rope_scaling`` when
    the config has one.

    Args:
        hf_config: Model config to inspect. Its ``rope_scaling`` attribute
            is overwritten with a "dynamic" entry when the requested
            length exceeds the derived one.
        max_model_len: User-requested maximum length, or None to use the
            derived value.

    Returns:
        The maximum model length to use.
    """
    unbounded = float("inf")
    possible_keys = [
        # Cohere: needs to prioritize this over "max_position_embeddings"
        "model_max_length",
        # OPT
        "max_position_embeddings",
        # GPT-2
        "n_positions",
        # MPT
        "max_seq_len",
        # ChatGLM2
        "seq_length",
        # Others
        "max_sequence_length",
        "max_seq_length",
        "seq_len",
    ]

    # Only the first key that is present (and not None) is consulted; the
    # nested generators keep the attribute lookups lazy.
    configured_values = (getattr(hf_config, key, None)
                         for key in possible_keys)
    first_hit = next(
        (value for value in configured_values if value is not None), None)
    derived_max_model_len = (unbounded if first_hit is None else min(
        unbounded, first_hit))

    if derived_max_model_len == unbounded:
        if max_model_len is not None:
            # If max_model_len is specified, we use it.
            return max_model_len

        default_max_len = 2048
        logger.warning(
            "The model's config.json does not contain any of the following "
            "keys to determine the original maximum length of the model: "
            f"{possible_keys}. Assuming the model's maximum length is "
            f"{default_max_len}.")
        derived_max_model_len = default_max_len

    rope_scaling = getattr(hf_config, "rope_scaling", None)
    if rope_scaling is not None:
        assert "factor" in rope_scaling
        existing_factor = rope_scaling["factor"]
        if rope_scaling["type"] == "yarn":
            # For yarn, the factor is applied to the original positions.
            derived_max_model_len = rope_scaling[
                "original_max_position_embeddings"]
        derived_max_model_len *= existing_factor

    if max_model_len is None:
        return int(derived_max_model_len)

    if max_model_len > derived_max_model_len:
        # The request exceeds what the config supports: install a dynamic
        # RoPE scaling entry sized to cover the gap (best effort).
        hf_config.rope_scaling = {
            "factor": max_model_len / derived_max_model_len,
            "type": "dynamic",
        }
        logger.warning(
            f"User-specified max_model_len {max_model_len} is higher than "
            f"the original {derived_max_model_len}. "
            "Attempting to use RoPE scaling.")

    return int(max_model_len)
@dataclass
class DecodingConfig:
    """Dataclass which contains the decoding strategy of the engine."""

    # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
    guided_decoding_backend: str = 'outlines'

    def __post_init__(self):
        """Validate the configured guided decoding backend.

        Raises:
            ValueError: If ``guided_decoding_backend`` is not one of the
                supported backends.
        """
        valid_guided_backends = ['outlines', 'lm-format-enforcer']
        backend = self.guided_decoding_backend
        if backend not in valid_guided_backends:
            # The two fragments are concatenated, so the first one must end
            # with a space to keep the message readable.
            raise ValueError(f"Invalid guided_decoding_backend '{backend}', "
                             f"must be one of {valid_guided_backends}")
@dataclass(frozen=True)
class EngineConfig:
    """Immutable bundle of all engine-related configuration objects.

    Grouping them simplifies passing the distinct configurations around
    the codebase.
    """

    model_config: ModelConfig
    cache_config: CacheConfig
    parallel_config: ParallelConfig
    scheduler_config: SchedulerConfig
    device_config: DeviceConfig
    lora_config: Optional[LoRAConfig]
    vision_language_config: Optional[VisionLanguageConfig]
    speculative_config: Optional[SpeculativeConfig]
    decoding_config: Optional[DecodingConfig]

    def __post_init__(self):
        """Verify configs are valid & consistent with each other."""
        parallel = self.parallel_config
        self.model_config.verify_with_parallel_config(parallel)
        self.cache_config.verify_with_parallel_config(parallel)

        lora = self.lora_config
        if not lora:
            return
        # LoRA settings are cross-checked only when LoRA is configured.
        lora.verify_with_model_config(self.model_config)
        lora.verify_with_scheduler_config(self.scheduler_config)

    def to_dict(self):
        """Return the configs as a dictionary, for use in **kwargs.

        Keys are the dataclass field names; values are the stored config
        objects themselves (no copying).
        """
        return {
            config_field.name: getattr(self, config_field.name)
            for config_field in fields(self)
        }
|