12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664 |
- import enum
- import json
- import os
- from dataclasses import dataclass, field, fields
- from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Tuple,
- Type, Union)
- import torch
- from loguru import logger
- from transformers import PretrainedConfig
- from aphrodite.common.utils import (cuda_device_count_stateless,
- get_cpu_memory, is_cpu, is_hip, is_neuron,
- is_openvino, is_tpu, is_xpu,
- print_warning_once)
- from aphrodite.distributed import get_current_tp_rank_partition_size
- from aphrodite.modeling.models import ModelRegistry
- from aphrodite.quantization import QUANTIZATION_METHODS
- from aphrodite.transformers_utils.config import get_config, get_hf_text_config
- if TYPE_CHECKING:
- from ray.util.placement_group import PlacementGroup
- from aphrodite.executor.executor_base import ExecutorBase
- from aphrodite.modeling.model_loader.loader import BaseModelLoader
- from aphrodite.transformers_utils.tokenizer_group.base_tokenizer_group import ( # noqa: E501
- BaseTokenizerGroup)
# Opt-in switch read once at import time: when the APHRODITE_USE_MODELSCOPE
# environment variable equals "true" (case-insensitive), models are loaded
# from ModelScope instead of the Hugging Face Hub.
APHRODITE_USE_MODELSCOPE = (os.environ.get("APHRODITE_USE_MODELSCOPE",
                                           "False").lower() == "true")

# Number of bytes in one GiB.
_GB = 2**30

_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768

# Architecture names for which pipeline parallelism is allowed; any other
# architecture is rejected when pipeline_parallel_size > 1.
_PP_SUPPORTED_MODELS = [
    "AquilaModel",
    "AquilaForCausalLM",
    "InternLMForCausalLM",
    "LlamaForCausalLM",
    "LLaMAForCausalLM",
    "MistralForCausalLM",
    "Phi3ForCausalLM",
    "MixtralForCausalLM",
    "NemotronForCausalLM",
    "Qwen2ForCausalLM",
    "Qwen2MoeForCausalLM",
]

# Quantization methods that do NOT trigger the "not fully optimized yet"
# warning during quantization verification.
_OPTIMIZED_QUANTS = [
    "fp8",
    "marlin",
    "gptq_marlin_24",
    "gptq_marlin",
    "awq_marlin",
    "fbgemm_fp8",
    "compressed-tensors",
    "compressed_tensors",
]
class ModelConfig:
    """Configuration for the model.

    Args:
        model: Name or path of the huggingface model to use.
            It is also used as the content for `model_name` tag in metrics
            output when `served_model_name` is not specified.
        tokenizer: Name or path of the huggingface tokenizer to use.
        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
            available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        dtype: Data type for model weights and activations. The "auto" option
            will use FP16 precision for FP32 and FP16 models, and BF16 precision
            for BF16 models.
        seed: Random seed for reproducibility.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id. If unspecified, will use the default
            version.
        code_revision: The specific revision to use for the model code on
            Hugging Face Hub. It can be a branch name, a tag name, or a
            commit id. If unspecified, will use the default version.
        rope_scaling: Dictionary containing the scaling configuration for the
            RoPE embeddings. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        rope_theta: Forwarded to `get_config` together with `rope_scaling`
            when the Hugging Face config is loaded.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id. If unspecified, will use
            the same value as `revision`.
        max_model_len: Maximum length of a sequence (including prompt and
            output). If None, will be derived from the model.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
        deepspeed_fp_bits: Number of bits to use for DeepSpeed FP quantization.
            Supported number of bits are: 4, 6, 8, 12.
        quantization_param_path: Path to JSON file containing scaling factors.
            Used to load KV cache scaling factors into the model when KV cache
            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
            be used to load activation and weight scaling factors when the
            model dtype is FP8_E4M3 on ROCm.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: DEPRECATED. Passing any non-None value
            raises ValueError; use max_seq_len_to_capture instead.
        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode. Capped at `max_model_len`.
        max_logprobs: Stored as-is on the config (presumably the cap on the
            number of log probabilities returned per token; confirm against
            the consumers of this attribute).
        disable_sliding_window: Whether to disable sliding window. If True,
            we will disable the sliding window functionality of the model.
            If the model does not support sliding window, this argument is
            ignored.
        skip_tokenizer_init: If true, skip initialization of tokenizer and
            detokenizer.
        served_model_name: The model name used in metrics tag `model_name`,
            matches the model name exposed via the APIs. If multiple model
            names provided, the first name will be used. If not specified,
            the model name will be the same as `model`.
        multimodal_config: Optional multimodal settings, stored as-is.

    Raises:
        ValueError: If `max_context_len_to_capture` is given, the tokenizer
            mode is unknown, or the quantization settings are inconsistent.
    """

    def __init__(
        self,
        model: str,
        tokenizer: str,
        tokenizer_mode: str,
        trust_remote_code: bool,
        dtype: Union[str, torch.dtype],
        seed: int,
        revision: Optional[str] = None,
        code_revision: Optional[str] = None,
        rope_scaling: Optional[dict] = None,
        rope_theta: Optional[float] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        deepspeed_fp_bits: Optional[int] = None,
        quantization_param_path: Optional[str] = None,
        enforce_eager: bool = True,
        max_context_len_to_capture: Optional[int] = None,
        max_seq_len_to_capture: Optional[int] = None,
        max_logprobs: int = 5,
        disable_sliding_window: bool = False,
        skip_tokenizer_init: bool = False,
        served_model_name: Optional[Union[str, List[str]]] = None,
        multimodal_config: Optional["MultiModalConfig"] = None,
    ) -> None:
        self.model = model
        self.tokenizer = tokenizer
        self.tokenizer_mode = tokenizer_mode
        self.trust_remote_code = trust_remote_code
        self.seed = seed
        self.revision = revision
        self.code_revision = code_revision
        self.rope_scaling = rope_scaling
        self.rope_theta = rope_theta
        # The tokenizer version is consistent with the model version by default.
        if tokenizer_revision is None:
            self.tokenizer_revision = revision
        else:
            self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
        self.deepspeed_fp_bits = deepspeed_fp_bits
        self.quantization_param_path = quantization_param_path
        self.enforce_eager = enforce_eager
        self.max_context_len_to_capture = max_context_len_to_capture
        if self.max_context_len_to_capture is not None:
            raise ValueError("`max_context_len_to_capture` is deprecated. "
                             "Use `max_seq_len_to_capture` instead.")
        # `max_context_len_to_capture` is always None past the check above, so
        # a falsy `max_seq_len_to_capture` ends up as None here and is later
        # replaced by `max_model_len` in `_verify_cuda_graph`.
        self.max_seq_len_to_capture = (max_seq_len_to_capture
                                       or max_context_len_to_capture)
        self.max_logprobs = max_logprobs
        self.disable_sliding_window = disable_sliding_window
        self.skip_tokenizer_init = skip_tokenizer_init

        self.hf_config = get_config(self.model, trust_remote_code, revision,
                                    code_revision, rope_scaling, rope_theta)
        self.hf_text_config = get_hf_text_config(self.hf_config)
        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

        if (not self.disable_sliding_window
                and self.hf_text_config.model_type == "gemma2"
                and self.hf_text_config.sliding_window is not None):
            print_warning_once(
                "Gemma 2 uses sliding window attention for every odd layer, "
                "which is currently not supported by Aphrodite. Disabling "
                "sliding window and capping the max length to the sliding "
                f"window size ({self.hf_text_config.sliding_window}).")
            self.disable_sliding_window = True

        self.max_model_len = _get_and_verify_max_len(
            hf_config=self.hf_text_config,
            max_model_len=max_model_len,
            disable_sliding_window=self.disable_sliding_window,
            sliding_window_len=self.get_hf_config_sliding_window(),
            rope_scaling_arg=self.rope_scaling)
        self.served_model_name = get_served_model_name(model,
                                                       served_model_name)
        self.multimodal_config = multimodal_config

        if not self.skip_tokenizer_init:
            self._verify_tokenizer_mode()
        self._verify_embedding_mode()
        self._verify_quantization()
        self._verify_cuda_graph()

    def _verify_tokenizer_mode(self) -> None:
        """Normalize `tokenizer_mode` to lower case and validate it.

        Raises:
            ValueError: If the mode is neither "auto" nor "slow".
        """
        tokenizer_mode = self.tokenizer_mode.lower()
        if tokenizer_mode not in ["auto", "slow"]:
            raise ValueError(
                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
                "either 'auto' or 'slow'.")
        self.tokenizer_mode = tokenizer_mode

    def _verify_embedding_mode(self) -> None:
        """Set `embedding_mode` from the architectures in the HF config."""
        # `architectures` may be present but None, so the getattr default
        # alone is not enough to guarantee an iterable.
        architectures = getattr(self.hf_config, "architectures", None) or []
        self.embedding_mode = any(
            ModelRegistry.is_embedding_model(arch) for arch in architectures)

    def _parse_quant_hf_config(self):
        """Return the quantization section of the HF config, or None."""
        quant_cfg = getattr(self.hf_config, "quantization_config", None)
        if quant_cfg is None:
            # compressed-tensors uses a "compression_config" key
            quant_cfg = getattr(self.hf_config, "compression_config", None)
        return quant_cfg

    def _verify_quantization(self) -> None:
        """Reconcile and validate the quantization method.

        The user-supplied method is lower-cased, compared against (or
        filled in from) the method found in the HF config, and then checked
        against the supported method lists.

        Raises:
            ValueError: If the configured and requested methods disagree,
                the method is unknown or unsupported on ROCm, or
                `deepspeed_fp_bits` is missing for deepspeedfp.
        """
        supported_quantization = [*QUANTIZATION_METHODS]
        rocm_supported_quantization = ["gptq", "squeezellm"]
        if self.quantization is not None:
            self.quantization = self.quantization.lower()

        # Parse quantization method from the HF model config, if available.
        quant_cfg = self._parse_quant_hf_config()

        if quant_cfg is not None:
            quant_method = quant_cfg.get("quant_method", "").lower()

            # Detect which checkpoint it is: the first method that reports an
            # override wins.
            for method in QUANTIZATION_METHODS.values():
                quantization_override = method.override_quantization_method(
                    quant_cfg, self.quantization)
                if quantization_override:
                    quant_method = quantization_override
                    self.quantization = quantization_override
                    break

            # Verify quantization configurations.
            if self.quantization is None:
                self.quantization = quant_method
            elif self.quantization != quant_method:
                raise ValueError(
                    "Quantization method specified in the model config "
                    f"({quant_method}) does not match the quantization "
                    f"method specified in the `quantization` argument "
                    f"({self.quantization}).")

        if self.quantization == "deepspeedfp":
            # Inject a quantization config into the HF config; the group size
            # can be overridden through the DEEPSPEED_GROUP_SIZE env var.
            gs = 32 if self.deepspeed_fp_bits == 4 else 128
            self.hf_config.quantization_config = {
                "bits": self.deepspeed_fp_bits,
                "group_size": int(os.environ.get("DEEPSPEED_GROUP_SIZE", gs)),
                "quant_method": "deepspeedfp"
            }

        if self.quantization is not None:
            if self.quantization not in supported_quantization:
                raise ValueError(
                    f"Unknown quantization method: {self.quantization}. Must "
                    f"be one of {supported_quantization}.")
            if is_hip(
            ) and self.quantization not in rocm_supported_quantization:
                raise ValueError(
                    f"{self.quantization} quantization is currently not "
                    "supported in ROCm.")
            if self.quantization not in _OPTIMIZED_QUANTS:
                logger.warning(
                    f"{self.quantization} quantization is not fully "
                    "optimized yet. The speed can be slower than "
                    "non-quantized models.")
        if self.quantization == "deepspeedfp" and self.deepspeed_fp_bits \
                is None:
            raise ValueError(
                "deepspeed_fp_bits must be specified when using "
                "deepspeedfp quantization.")

    def _verify_cuda_graph(self) -> None:
        """Default `max_seq_len_to_capture` and cap it at `max_model_len`."""
        if self.max_seq_len_to_capture is None:
            self.max_seq_len_to_capture = self.max_model_len
        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                          self.max_model_len)

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        """Check this model config against the parallel configuration.

        Raises:
            ValueError: If quantization is used and the attention head count
                is not divisible by the tensor parallel size, or if
                bitsandbytes is combined with TP/PP or with
                `enforce_eager=False`.
            NotImplementedError: If pipeline parallelism is requested for an
                architecture outside `_PP_SUPPORTED_MODELS`.
        """
        total_num_attention_heads = getattr(self.hf_text_config,
                                            "num_attention_heads", 0)
        tensor_parallel_size = parallel_config.tensor_parallel_size
        if (total_num_attention_heads % tensor_parallel_size != 0
                and self.quantization is not None):
            raise ValueError(
                f"Total number of attention heads "
                f"({total_num_attention_heads})"
                " must be divisible by tensor parallel size "
                f"({tensor_parallel_size}) when quantization is used.")

        pipeline_parallel_size = parallel_config.pipeline_parallel_size
        # `architectures` may be present but None; treat that like "missing".
        architectures = getattr(self.hf_config, "architectures", None) or []
        if not all(arch in _PP_SUPPORTED_MODELS
                   for arch in architectures) and pipeline_parallel_size > 1:
            raise NotImplementedError(
                "Pipeline parallelism is only supported for the following "
                f" architectures: {_PP_SUPPORTED_MODELS}.")

        if self.quantization == "bitsandbytes" and (
                parallel_config.tensor_parallel_size > 1
                or parallel_config.pipeline_parallel_size > 1):
            raise ValueError(
                "BitsAndBytes quantization with TP/PP is not supported yet.")

        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
            raise ValueError(
                "BitsAndBytes with enforce_eager=False is not supported yet.")

    def get_hf_config_sliding_window(self) -> Optional[int]:
        """Get the sliding window size from the HF config, or None if the
        config disables or does not define it.
        """
        # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in
        # addition to sliding window size. We check if that field is present
        # and if it's False, return None.
        if (hasattr(self.hf_text_config, "use_sliding_window")
                and not self.hf_text_config.use_sliding_window):
            return None
        return getattr(self.hf_text_config, "sliding_window", None)

    def get_sliding_window(self) -> Optional[int]:
        """Get the sliding window size, or None if disabled.
        """
        # If user disables sliding window, return None.
        if self.disable_sliding_window:
            return None
        # Otherwise get the value from the hf config.
        return self.get_hf_config_sliding_window()

    def get_vocab_size(self) -> int:
        """Return `vocab_size` from the HF text config."""
        return self.hf_text_config.vocab_size

    def get_hidden_size(self) -> int:
        """Return `hidden_size` from the HF text config."""
        return self.hf_text_config.hidden_size

    def get_head_size(self) -> int:
        """Return the attention head size.

        Uses 256 for `deepseek_v2`, then `head_dim` when the config defines
        it, otherwise `hidden_size // num_attention_heads`.
        """
        # TODO remove hard code
        if hasattr(self.hf_text_config, "model_type"
                   ) and self.hf_text_config.model_type == 'deepseek_v2':
            # FlashAttention supports only head_size 32, 64, 128, 256,
            # we need to pad head_size 192 to 256
            return 256
        if hasattr(self.hf_text_config, "head_dim"):
            return self.hf_text_config.head_dim
        # FIXME: This may not be true for all models.
        return (self.hf_text_config.hidden_size //
                self.hf_text_config.num_attention_heads)

    def get_total_num_kv_heads(self) -> int:
        """Returns the total number of KV heads."""
        # For GPTBigCode & Falcon:
        # NOTE: for falcon, when new_decoder_architecture is True, the
        # multi_query flag is ignored and we use n_head_kv for the number of
        # KV heads.
        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
        new_decoder_arch_falcon = (
            self.hf_config.model_type in falcon_model_types
            and getattr(self.hf_config, "new_decoder_architecture", False))
        if not new_decoder_arch_falcon and getattr(self.hf_text_config,
                                                   "multi_query", False):
            # Multi-query attention, only one KV head.
            # Currently, tensor parallelism is not supported in this case.
            return 1

        # For DBRX and MPT
        if self.hf_config.model_type == "mpt":
            if "kv_n_heads" in self.hf_config.attn_config:
                return self.hf_config.attn_config["kv_n_heads"]
            return self.hf_config.num_attention_heads
        if self.hf_config.model_type == "dbrx":
            return getattr(self.hf_config.attn_config, "kv_n_heads",
                           self.hf_config.num_attention_heads)

        attributes = [
            # For Falcon:
            "n_head_kv",
            "num_kv_heads",
            # For LLaMA-2:
            "num_key_value_heads",
            # For ChatGLM:
            "multi_query_group_num",
        ]
        for attr in attributes:
            num_kv_heads = getattr(self.hf_text_config, attr, None)
            if num_kv_heads is not None:
                return num_kv_heads

        # For non-grouped-query attention models, the number of KV heads is
        # equal to the number of attention heads.
        return self.hf_text_config.num_attention_heads

    def get_num_kv_heads(self,
                         parallel_config: "ParallelConfig",
                         tp_rank: int = 0) -> int:
        """Returns the number of KV heads per GPU."""
        total_num_kv_heads = self.get_total_num_kv_heads()
        # If tensor parallelism is used, we divide the number of KV heads by
        # the tensor parallel size. We will replicate the KV heads in the
        # case where the number of KV heads is smaller than the tensor
        # parallel size so each GPU has at least one KV head.
        result = get_current_tp_rank_partition_size(
            total_num_kv_heads, tp_rank, parallel_config.tensor_parallel_size)
        return max(1, result)

    def get_num_attention_heads(self,
                                parallel_config: "ParallelConfig",
                                tp_rank: int = 0) -> int:
        """Return the number of attention heads for the given TP rank.

        Returns 0 when the HF text config has no `num_attention_heads`.
        """
        if getattr(self.hf_text_config, "num_attention_heads", None) is None:
            return 0

        num_total_kv_heads = self.get_total_num_kv_heads()
        num_kv_heads = self.get_num_kv_heads(parallel_config, tp_rank)
        num_total_attention_heads = self.hf_text_config.num_attention_heads
        num_heads_per_kv_head = num_total_attention_heads // num_total_kv_heads
        # For GQA attention we make sure the whole attention head group is
        # together on the same GPU.
        return num_kv_heads * num_heads_per_kv_head

    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
        """Return the number of hidden layers in this rank's pipeline stage."""
        from aphrodite.distributed.utils import get_pp_indices
        total_num_hidden_layers = getattr(self.hf_text_config,
                                          "num_hidden_layers", 0)
        pp_rank = parallel_config.rank // parallel_config.tensor_parallel_size
        pp_size = parallel_config.pipeline_parallel_size
        start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size)
        return end - start

    def contains_seqlen_agnostic_layers(
            self, parallel_config: "ParallelConfig") -> bool:
        """True for Mamba/SSM models (Jamba)"""
        return self._get_num_seqlen_agnostic_layers(parallel_config) > 0

    def get_layers_block_type(self,
                              parallel_config: "ParallelConfig") -> List[str]:
        """Return the per-layer block types.

        Falls back to all-"attention" for this rank's layers when the HF
        config has no `layers_block_type`.
        """
        num_layers = self.get_num_layers(parallel_config)
        # Transformers supports layers_block_type @property
        return getattr(self.hf_config, "layers_block_type",
                       ["attention"] * num_layers)

    def get_num_attention_layers(self,
                                 parallel_config: "ParallelConfig") -> int:
        """Count the layers whose block type is "attention"."""
        return len([
            t for t in self.get_layers_block_type(parallel_config)
            if t == "attention"
        ])

    def _get_num_seqlen_agnostic_layers(
            self, parallel_config: "ParallelConfig") -> int:
        """Count the layers whose block type is not "attention"."""
        return len([
            t for t in self.get_layers_block_type(parallel_config)
            if t != "attention"
        ])
class CacheConfig:
    """Configuration for the KV cache.

    Args:
        block_size: Size of a cache block in number of tokens.
        gpu_memory_utilization: Fraction of GPU memory to use for the
            Aphrodite execution. Values above 1.0 are rejected.
        swap_space: Size of the CPU swap space per GPU (in GiB).
        cache_dtype: Data type for kv cache storage. One of "auto", "fp8",
            "fp8_e4m3" or "fp8_e5m2".
        num_gpu_blocks_override: Number of GPU blocks to use. This overrides the
            profiled num_gpu_blocks if specified. Does nothing if None.
        sliding_window: Sliding window size, or None. Prefix caching is
            rejected when this is set.
        enable_prefix_caching: Whether prefix caching is requested.
        cpu_offload_gb: Stored as-is (presumably the amount of CPU offload
            in GiB; confirm against the consumers of this attribute).

    Raises:
        ValueError: If `gpu_memory_utilization` exceeds 1.0 or `cache_dtype`
            is unknown.
        NotImplementedError: If prefix caching is combined with a sliding
            window or with the "fp8" cache dtype.
    """

    def __init__(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        swap_space: int,
        cache_dtype: str,
        num_gpu_blocks_override: Optional[int] = None,
        sliding_window: Optional[int] = None,
        enable_prefix_caching: bool = False,
        cpu_offload_gb: float = 0.0,
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
        # Swap space is given in GiB and kept in bytes.
        self.swap_space_bytes = swap_space * _GB
        self.num_gpu_blocks_override = num_gpu_blocks_override
        self.cache_dtype = cache_dtype
        self.sliding_window = sliding_window
        self.enable_prefix_caching = enable_prefix_caching
        self.cpu_offload_gb = cpu_offload_gb
        self._verify_args()
        self._verify_cache_dtype()
        self._verify_prefix_caching()

        # Will be set after profiling.
        self.num_gpu_blocks = None
        self.num_cpu_blocks = None

    def metrics_info(self):
        """Return every attribute as a `str -> str` mapping."""
        # convert cache_config to dict(key: str, value: str) for prometheus
        # metrics info
        return {key: str(value) for key, value in self.__dict__.items()}

    def _verify_args(self) -> None:
        """Reject a GPU memory utilization above 1.0.

        Raises:
            ValueError: If `gpu_memory_utilization` is greater than 1.0.
        """
        if self.gpu_memory_utilization > 1.0:
            # Exactly 1.0 is accepted, so the message states "or equal to".
            raise ValueError(
                "GPU memory utilization must be less than or equal to 1.0. "
                f"Got {self.gpu_memory_utilization}.")

    def _verify_cache_dtype(self) -> None:
        """Validate `cache_dtype`, logging a notice for fp8 variants.

        Raises:
            ValueError: If the dtype is not a recognized value.
        """
        if self.cache_dtype == "auto":
            pass
        elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
            logger.info(
                "Using fp8 data type to store kv cache. It reduces the GPU "
                "memory footprint and boosts the performance. "
                "Meanwhile, it may cause accuracy drop without a proper "
                "scaling factor")
        else:
            raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")

    def _verify_prefix_caching(self) -> None:
        """Reject option combinations prefix caching does not support.

        Raises:
            NotImplementedError: If prefix caching is enabled together with
                a sliding window or the "fp8" cache dtype.
        """
        if not self.enable_prefix_caching:
            return

        if self.sliding_window is not None:
            raise NotImplementedError(
                "Prefix caching is not supported with sliding window. "
                "Run with --disable-sliding-window to use prefix caching.")
        # NOTE(review): only the literal "fp8" is rejected here, while
        # "fp8_e4m3" and "fp8_e5m2" pass. Confirm whether that is intended.
        if self.cache_dtype == "fp8":
            raise NotImplementedError(
                "Prefix caching is not supported for fp8 cache_dtype. "
                "Run with --kv-cache-dtype auto to use prefix caching.")

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        """Check the requested swap space against total CPU memory.

        Warns above 40% of total CPU memory.

        Raises:
            ValueError: If the swap space for all GPUs of a tensor parallel
                group exceeds 70% of total CPU memory.
        """
        total_cpu_memory = get_cpu_memory()
        # FIXME: Here, it is assumed that the GPUs in a tensor parallel
        # group are in the same node. However, the GPUs may span multiple nodes.
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

        msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
               f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
               "allocated for the swap space.")
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
            logger.warning("Possibly too large swap space. " + msg)
@dataclass
class TokenizerPoolConfig:
    """Settings describing a pool of tokenizer workers.

    Args:
        pool_size: How many tokenizer workers the pool holds.
        pool_type: Either the string "ray" or a class object.
        extra_config: Extra, pool-type-specific options. Must be a dict.
    """
    pool_size: int
    pool_type: Union[str, Type["BaseTokenizerGroup"]]
    extra_config: dict

    def __post_init__(self):
        """Validate the pool type and the extra configuration."""
        named_pool = self.pool_type in ("ray", )
        class_pool = isinstance(self.pool_type, type)
        if not (named_pool or class_pool):
            raise ValueError(f"Unknown pool type: {self.pool_type}")
        if not isinstance(self.extra_config, dict):
            raise ValueError("extra_config must be a dictionary.")

    @classmethod
    def create_config(
        cls, tokenizer_pool_size: int, tokenizer_pool_type: str,
        tokenizer_pool_extra_config: Optional[Union[str, dict]]
    ) -> Optional["TokenizerPoolConfig"]:
        """Build a TokenizerPoolConfig, or return None for an empty pool.

        Args:
            tokenizer_pool_size: Number of tokenizer workers in the pool.
                A falsy value (e.g. 0) yields None.
            tokenizer_pool_type: Type of the pool.
            tokenizer_pool_extra_config: Additional config for the pool.
                A string is parsed as JSON; a falsy non-string value
                becomes an empty dict.
        """
        # No workers requested: there is no pool to configure.
        if not tokenizer_pool_size:
            return None

        if isinstance(tokenizer_pool_extra_config, str):
            parsed_extra = json.loads(tokenizer_pool_extra_config)
        else:
            parsed_extra = tokenizer_pool_extra_config or {}
        return cls(tokenizer_pool_size, tokenizer_pool_type, parsed_extra)
class LoadFormat(str, enum.Enum):
    """Names of the model weight formats accepted as a load format.

    Members are also `str` instances, so each one compares equal to its
    lower-case string value.
    """
    AUTO = "auto"
    PT = "pt"
    SAFETENSORS = "safetensors"
    NPCACHE = "npcache"
    DUMMY = "dummy"
    TENSORIZER = "tensorizer"
    SHARDED_STATE = "sharded_state"
    GGUF = "gguf"
    BITSANDBYTES = "bitsandbytes"
@dataclass
class LoadConfig:
    """Configuration for loading model weights.

    download_dir: Directory to download and load the weights, default to the
        default cache directory of huggingface.
    load_format: The format of the model weights to load:
        "auto" will try to load the weights in the safetensors format and
            fall back to the pytorch bin format if safetensors format is
            not available.
        "pt" will load the weights in the pytorch bin format.
        "safetensors" will load the weights in the safetensors format.
        "npcache" will load the weights in pytorch format and store
            a numpy cache to speed up the loading.
        "dummy" will initialize the weights with random values, which is
            mainly for profiling.
        "tensorizer" will use CoreWeave's tensorizer library for
            fast weight loading.
        A string value is lower-cased and converted to a `LoadFormat`
        member; non-string values are kept unchanged.
    model_loader_extra_config: Extra options for the model loader. A
        non-empty string is parsed as JSON.
    ignore_patterns: The list of patterns to ignore when loading the model.
        Defaults to ["original/**/*", "consolidated*"] when None or empty.
    """
    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
    download_dir: Optional[str] = None
    model_loader_extra_config: Optional[Union[str, dict]] = field(
        default_factory=dict)
    ignore_patterns: Optional[Union[List[str], str]] = None

    def __post_init__(self):
        """Parse JSON extras, validate the format, default the patterns."""
        # Falsy values (None, "", {}) are left untouched on the instance;
        # only a non-empty string is parsed.
        model_loader_extra_config = self.model_loader_extra_config or {}
        if isinstance(model_loader_extra_config, str):
            self.model_loader_extra_config = json.loads(
                model_loader_extra_config)
        self._verify_load_format()

        if self.ignore_patterns:
            logger.info(
                "Ignoring the following patterns when downloading weights: "
                f"{self.ignore_patterns}")
        else:
            self.ignore_patterns = ["original/**/*", "consolidated*"]

    def _verify_load_format(self) -> None:
        """Normalize a string `load_format` into a `LoadFormat` member.

        Raises:
            ValueError: If the string is not a valid `LoadFormat` value, or
                the format is listed as unsupported on ROCm.
        """
        if not isinstance(self.load_format, str):
            return

        load_format = self.load_format.lower()
        self.load_format = LoadFormat(load_format)

        rocm_not_supported_load_format: List[str] = []
        if is_hip() and load_format in rocm_not_supported_load_format:
            # Compare and report format *values* (lower-case), which is what
            # the unsupported list holds and what users pass in.
            rocm_supported_load_format = [
                fmt.value for fmt in LoadFormat
                if fmt.value not in rocm_not_supported_load_format
            ]
            raise ValueError(
                f"load format '{load_format}' is not supported in ROCm. "
                f"Supported load formats are "
                f"{rocm_supported_load_format}")
class ParallelConfig:
    """Configuration for the distributed execution.

    Args:
        pipeline_parallel_size: Number of pipeline parallel groups.
        tensor_parallel_size: Number of tensor parallel groups.
        worker_use_ray: Deprecated, use distributed_executor_backend instead.
        max_parallel_loading_workers: Maximum number of multiple batches
            when load model sequentially. To avoid RAM OOM when using tensor
            parallel and large models.
        disable_custom_all_reduce: Disable the custom all-reduce kernel and
            fall back to NCCL.
        tokenizer_pool_config: Config for the tokenizer pool.
            If None, will use synchronous tokenization.
        ray_workers_use_nsight: Whether to profile Ray workers with nsight,
            see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
        placement_group: ray distributed model workers placement group.
        distributed_executor_backend: Backend to use for distributed model
            workers, either "ray" or "mp" (multiprocessing). If either
            pipeline_parallel_size or tensor_parallel_size is greater than 1,
            will default to "ray" if Ray is installed or "mp" otherwise.
    """

    def __init__(
        self,
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        worker_use_ray: Optional[bool] = None,
        max_parallel_loading_workers: Optional[int] = None,
        disable_custom_all_reduce: bool = False,
        tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
        ray_workers_use_nsight: bool = False,
        placement_group: Optional["PlacementGroup"] = None,
        distributed_executor_backend: Optional[Union[
            str, Type["ExecutorBase"]]] = None,
    ) -> None:
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.distributed_executor_backend = distributed_executor_backend
        self.max_parallel_loading_workers = max_parallel_loading_workers
        self.disable_custom_all_reduce = disable_custom_all_reduce
        self.tokenizer_pool_config = tokenizer_pool_config
        self.ray_workers_use_nsight = ray_workers_use_nsight
        self.placement_group = placement_group

        self.world_size = pipeline_parallel_size * self.tensor_parallel_size

        # Honour the deprecated flag: it either selects Ray or must agree
        # with an explicitly chosen backend.
        if worker_use_ray:
            if self.distributed_executor_backend is None:
                self.distributed_executor_backend = "ray"
            elif not self.use_ray:
                raise ValueError(f"worker-use-ray can't be used with "
                                 f"distributed executor backend "
                                 f"'{self.distributed_executor_backend}'.")

        needs_default = (self.distributed_executor_backend is None
                         and self.world_size > 1)
        if needs_default:
            chosen = self._pick_default_backend()
            self.distributed_executor_backend = chosen
            logger.info(
                f"Defaulting to use {chosen} for distributed inference.")

        self._verify_args()
        self.rank = 0

    def _pick_default_backend(self) -> str:
        """Choose "mp" or "ray" when no backend was given explicitly.

        Multiprocessing is used by default if world_size fits on the
        current node and we aren't in a ray placement group.
        """
        from aphrodite.executor import ray_utils

        ray_found = ray_utils.ray_is_available()

        if cuda_device_count_stateless() < self.world_size:
            # Not enough local devices: Ray is mandatory.
            if ray_found:
                return "ray"
            raise ValueError("Unable to load Ray which is "
                             "required for multi-node inference, "
                             "please install Ray with `pip install "
                             "ray`.") from ray_utils.ray_import_err

        if not ray_found:
            return "mp"
        if self.placement_group:
            return "ray"

        from ray import is_initialized as ray_is_initialized
        if not ray_is_initialized():
            return "mp"

        from ray.util import get_current_placement_group
        return "ray" if get_current_placement_group() else "mp"

    @property
    def use_ray(self) -> bool:
        """Whether the configured backend is Ray (by name or by class)."""
        backend = self.distributed_executor_backend
        return backend == "ray" or (isinstance(backend, type)
                                    and backend.uses_ray)

    def _verify_args(self) -> None:
        """Validate the backend choice and apply platform restrictions."""
        # Lazy import to avoid circular import
        from aphrodite.executor.executor_base import ExecutorBase

        backend = self.distributed_executor_backend
        recognized = backend in ("ray", "mp", None) or (
            isinstance(backend, type) and issubclass(backend, ExecutorBase))
        if not recognized:
            raise ValueError(
                "Unrecognized distributed executor backend "
                f"{self.distributed_executor_backend}. Supported "
                "values are 'ray', 'mp' or custom ExecutorBase subclass.")

        if self.use_ray:
            from aphrodite.executor import ray_utils
            ray_utils.assert_ray_available()

        if is_hip():
            self.disable_custom_all_reduce = True
            logger.info(
                "Disabled the custom all-reduce kernel because it is not "
                "supported on AMD GPUs.")

        if self.ray_workers_use_nsight and not self.use_ray:
            raise ValueError("Unable to use nsight profiling unless workers "
                             "run with Ray.")
class SchedulerConfig:
    """Scheduler configuration.

    Args:
        max_num_batched_tokens: Maximum number of tokens to be processed in
            a single iteration.
        max_num_seqs: Maximum number of sequences to be processed in a single
            iteration.
        max_model_len: Maximum length of a sequence (including prompt
            and generated text).
        use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not.
        num_lookahead_slots: The number of slots to allocate per sequence per
            step, beyond the known token ids. This is used in speculative
            decoding to store KV activations of tokens which may or may not be
            accepted.
        delay_factor: Apply a delay (of delay factor multiplied by previous
            prompt latency) before scheduling next prompt.
        enable_chunked_prefill: If True, prefill requests can be chunked based
            on the remaining max_num_batched_tokens.
        embedding_mode: Whether the running model is for embedding.
        preemption_mode: Whether to perform preemption by swapping or
            recomputation. If not specified, we determine the mode as follows:
            We use recomputation by default since it incurs lower overhead than
            swapping. However, when the sequence group has multiple sequences
            (e.g., beam search), recomputation is not currently supported. In
            such a case, we use swapping instead.
    """

    def __init__(self,
                 max_num_batched_tokens: Optional[int],
                 max_num_seqs: int,
                 max_model_len: int,
                 use_v2_block_manager: bool = False,
                 num_lookahead_slots: int = 0,
                 delay_factor: float = 0.0,
                 enable_chunked_prefill: bool = False,
                 embedding_mode: Optional[bool] = False,
                 preemption_mode: Optional[str] = None) -> None:
        token_budget = max_num_batched_tokens
        if token_budget is None:
            if enable_chunked_prefill:
                # For chunked prefill, choose the well-tuned batch size.
                token_budget = 768
            elif embedding_mode:
                # For embedding, choose specific value for higher throughput
                token_budget = max(max_model_len,
                                   _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS)
            else:
                # If max_model_len is too short, use 2048 as the default
                # value for higher throughput.
                token_budget = max(max_model_len, 2048)
        self.max_num_batched_tokens = token_budget

        if enable_chunked_prefill:
            logger.info(
                "Chunked prefill is enabled with "
                f"max_num_batched_tokens={self.max_num_batched_tokens}.")

        self.max_num_seqs = max_num_seqs
        self.max_model_len = max_model_len
        self.use_v2_block_manager = use_v2_block_manager
        self.num_lookahead_slots = num_lookahead_slots
        self.delay_factor = delay_factor
        self.chunked_prefill_enabled = enable_chunked_prefill
        self.embedding_mode = embedding_mode
        self.preemption_mode = preemption_mode
        self._verify_args()

    def _verify_args(self) -> None:
        """Check that the token budget and lookahead slots are consistent.

        Raises:
            ValueError: If the token budget is smaller than max_model_len
                (without chunked prefill), smaller than max_num_seqs, or if
                num_lookahead_slots is negative.
        """
        budget = self.max_num_batched_tokens

        too_small_for_model = budget < self.max_model_len
        if too_small_for_model and not self.chunked_prefill_enabled:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({self.max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes Aphrodite reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len.")

        if budget < self.max_num_seqs:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                "be greater than or equal to max_num_seqs "
                f"({self.max_num_seqs}).")

        if self.num_lookahead_slots < 0:
            raise ValueError(
                "num_lookahead_slots "
                f"({self.num_lookahead_slots}) must be greater than or "
                "equal to 0.")
class DeviceConfig:
    """Resolves the device type and the torch device used for inputs."""

    def __init__(self, device: str = "auto") -> None:
        # "auto" triggers detection; anything else is taken verbatim.
        if device == "auto":
            self.device_type = self._detect_device_type()
        else:
            self.device_type = device

        if self.device_type == "tpu":
            self.device = None
        else:
            # Some device types require processing inputs on CPU
            on_cpu = self.device_type in ("neuron", "openvino")
            self.device = torch.device("cpu" if on_cpu else self.device_type)

    @staticmethod
    def _detect_device_type() -> str:
        """Probe the platform helpers in priority order."""
        if is_neuron():
            return "neuron"
        if is_openvino():
            return "openvino"
        if is_tpu():
            return "tpu"
        if is_cpu():
            return "cpu"
        if is_xpu():
            return "xpu"
        # We don't call torch.cuda.is_available() here to
        # avoid initializing CUDA before workers are forked
        return "cuda"
class SpeculativeConfig:
    """Configuration for speculative decoding.

    The configuration is currently specialized to draft-model speculative
    decoding with top-1 proposals.
    """

    @staticmethod
    def maybe_create_spec_config(
        target_model_config: ModelConfig,
        target_parallel_config: ParallelConfig,
        target_dtype: str,
        speculative_model: Optional[str],
        speculative_draft_tensor_parallel_size: Optional[int],
        num_speculative_tokens: Optional[int],
        speculative_max_model_len: Optional[int],
        enable_chunked_prefill: bool,
        use_v2_block_manager: bool,
        disable_log_stats: bool,
        speculative_disable_by_batch_size: Optional[int],
        ngram_prompt_lookup_max: Optional[int],
        ngram_prompt_lookup_min: Optional[int],
        draft_token_acceptance_method: str,
        typical_acceptance_sampler_posterior_threshold: Optional[float],
        typical_acceptance_sampler_posterior_alpha: Optional[float],
        disable_logprobs: Optional[bool],
    ) -> Optional["SpeculativeConfig"]:
        """Create a SpeculativeConfig if possible, else return None.

        Returns None when no speculative model is given; otherwise validates
        the arguments, builds the draft model/parallel configs and returns a
        SpeculativeConfig.

        Args:
            target_model_config (ModelConfig): The configuration of the target
                model.
            target_parallel_config (ParallelConfig): The parallel configuration
                for the target model.
            target_dtype (str): The data type used for the target model.
            speculative_model (Optional[str]): The name of the speculative
                model, if provided. The special value "[ngram]" selects
                ngram prompt lookup and reuses the target configs.
            speculative_draft_tensor_parallel_size (Optional[int]): The degree
                of the tensor parallelism for the draft model.
            num_speculative_tokens (Optional[int]): The number of speculative
                tokens, if provided. Will default to the number in the draft
                model config if present, otherwise is required.
            speculative_max_model_len (Optional[int]): The maximum model len of
                the speculative model. Used when testing the ability to skip
                speculation for some sequences.
            enable_chunked_prefill (bool): Whether Aphrodite is configured to
                use chunked prefill or not. Used for raising an error since its
                not yet compatible with spec decode.
            use_v2_block_manager (bool): Whether Aphrodite is configured to
                use the v2 block manager or not. Used for raising an error
                since the v2 block manager is required with spec decode.
            disable_log_stats (bool): Forwarded unchanged to the
                SpeculativeConfig constructor.
            speculative_disable_by_batch_size (Optional[int]): Disable
                speculative decoding for new incoming requests when the number
                of enqueue requests is larger than this value, if provided.
            ngram_prompt_lookup_max (Optional[int]): Max size of ngram token
                window, if provided.
            ngram_prompt_lookup_min (Optional[int]): Min size of ngram token
                window, if provided.
            draft_token_acceptance_method (str): The method to use for
                accepting draft tokens. This can take two possible
                values 'rejection_sampler' and 'typical_acceptance_sampler'
                for RejectionSampler and TypicalAcceptanceSampler
                respectively.
            typical_acceptance_sampler_posterior_threshold (Optional[float]):
                A threshold value that sets a lower bound on the posterior
                probability of a token in the target model for it to be
                accepted. This threshold is used only when we use the
                TypicalAcceptanceSampler for token acceptance. Defaults to
                0.09 when None.
            typical_acceptance_sampler_posterior_alpha (Optional[float]):
                A scaling factor for the entropy-based threshold in the
                TypicalAcceptanceSampler. Defaults to 0.3 when None.
            disable_logprobs (Optional[bool]): If set to True, token log
                probabilities are not returned during speculative decoding.
                If set to False, token log probabilities are returned
                according to the log probability settings in SamplingParams.
                If not specified, it defaults to True.

        Returns:
            Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
                the necessary conditions are met, else None.

        Raises:
            ValueError: If the arguments are inconsistent (see the individual
                checks below).
        """
        if speculative_model is None:
            if num_speculative_tokens is not None:
                raise ValueError("num_speculative_tokens was provided without "
                                 "speculative_model.")
            return None

        if (speculative_disable_by_batch_size is not None
                and speculative_disable_by_batch_size < 2):
            raise ValueError("Expected the batch size threshold of disabling "
                             "speculative decoding is > 1, but got "
                             f"{speculative_disable_by_batch_size=}")

        if enable_chunked_prefill:
            raise ValueError(
                "Speculative decoding and chunked prefill are "
                f"currently mutually exclusive ({enable_chunked_prefill=}).")

        if not use_v2_block_manager:
            raise ValueError(
                "Speculative decoding requires usage of the V2 "
                "block manager. Enable it with --use-v2-block-manager.")

        # TODO: The user should be able to specify revision/quantization/max
        # model len for the draft model. It is not currently supported.
        draft_revision = None
        draft_code_revision = None
        draft_quantization = None

        if speculative_model == "[ngram]":
            if ngram_prompt_lookup_min is None:
                ngram_prompt_lookup_min = 1
            if ngram_prompt_lookup_max is None or ngram_prompt_lookup_max < 1:
                raise ValueError(f"{ngram_prompt_lookup_max=} must be > 0")
            if ngram_prompt_lookup_min < 1:
                raise ValueError(f"{ngram_prompt_lookup_min=} must be > 0")
            if ngram_prompt_lookup_min > ngram_prompt_lookup_max:
                raise ValueError(f"{ngram_prompt_lookup_min=} cannot be "
                                 f"larger than {ngram_prompt_lookup_max=}")

            # TODO: current we still need extract vocab_size from target model
            # config, in future, we may try refactoring it out, and set
            # draft related config as None here.
            draft_model_config = target_model_config
            draft_parallel_config = target_parallel_config
        else:
            ngram_prompt_lookup_max = 0
            ngram_prompt_lookup_min = 0
            draft_model_config = ModelConfig(
                model=speculative_model,
                tokenizer=target_model_config.tokenizer,
                tokenizer_mode=target_model_config.tokenizer_mode,
                trust_remote_code=target_model_config.trust_remote_code,
                dtype=target_model_config.dtype,
                seed=target_model_config.seed,
                revision=draft_revision,
                code_revision=draft_code_revision,
                tokenizer_revision=target_model_config.tokenizer_revision,
                max_model_len=None,
                quantization=draft_quantization,
                enforce_eager=target_model_config.enforce_eager,
                max_seq_len_to_capture=target_model_config.
                max_seq_len_to_capture,
                max_logprobs=target_model_config.max_logprobs,
            )

            draft_hf_config = draft_model_config.hf_config

            # Push an explicit token count into draft configs that expose
            # a num_lookahead_tokens attribute.
            if (num_speculative_tokens is not None
                    and hasattr(draft_hf_config, "num_lookahead_tokens")):
                draft_hf_config.num_lookahead_tokens = num_speculative_tokens

            n_predict = getattr(draft_hf_config, "n_predict", None)
            if n_predict is not None:
                if num_speculative_tokens is None:
                    # Default to max value defined in draft model config.
                    num_speculative_tokens = n_predict
                elif num_speculative_tokens > n_predict:
                    # Verify provided value doesn't exceed the maximum
                    # supported by the draft model.
                    raise ValueError(
                        "This speculative model supports a maximum of "
                        f"num_speculative_tokens={n_predict}, but "
                        f"{num_speculative_tokens=} was provided.")

            draft_model_config.max_model_len = (
                SpeculativeConfig._maybe_override_draft_max_model_len(
                    speculative_max_model_len,
                    draft_model_config.max_model_len,
                    target_model_config.max_model_len,
                ))

            draft_parallel_config = (
                SpeculativeConfig.create_draft_parallel_config(
                    target_parallel_config,
                    speculative_draft_tensor_parallel_size))

        if num_speculative_tokens is None:
            raise ValueError(
                "num_speculative_tokens must be provided with "
                "speculative_model unless the draft model config contains an "
                "n_predict parameter.")

        if typical_acceptance_sampler_posterior_threshold is None:
            typical_acceptance_sampler_posterior_threshold = 0.09
        if typical_acceptance_sampler_posterior_alpha is None:
            typical_acceptance_sampler_posterior_alpha = 0.3
        if disable_logprobs is None:
            disable_logprobs = True

        return SpeculativeConfig(
            draft_model_config,
            draft_parallel_config,
            num_speculative_tokens,
            speculative_disable_by_batch_size,
            ngram_prompt_lookup_max,
            ngram_prompt_lookup_min,
            draft_token_acceptance_method=draft_token_acceptance_method,
            typical_acceptance_sampler_posterior_threshold=\
                typical_acceptance_sampler_posterior_threshold,
            typical_acceptance_sampler_posterior_alpha=\
                typical_acceptance_sampler_posterior_alpha,
            disable_logprobs=disable_logprobs,
            disable_log_stats=disable_log_stats,
        )

    @staticmethod
    def _maybe_override_draft_max_model_len(
        speculative_max_model_len: Optional[int],
        draft_max_model_len: int,
        target_max_model_len: int,
    ) -> int:
        """Determine the max sequence len for the draft model. This is usually
        the draft_max_model_len, but may be the target_max_model_len if it is
        less than the draft_max_model_len, or may be speculative_max_model_len
        if it is specified.

        This is necessary so that sequences do not exceed the capacity of the
        draft model or the target model.

        speculative_max_model_len is mainly used for testing that sequences can
        skip speculation.

        Raises:
            ValueError: If speculative_max_model_len exceeds either the draft
                or the target maximum length.
        """
        if speculative_max_model_len is not None:
            if speculative_max_model_len > draft_max_model_len:
                raise ValueError(f"{speculative_max_model_len=} cannot be "
                                 f"larger than {draft_max_model_len=}")

            if speculative_max_model_len > target_max_model_len:
                raise ValueError(f"{speculative_max_model_len=} cannot be "
                                 f"larger than {target_max_model_len=}")

            return speculative_max_model_len

        return min(
            draft_max_model_len,
            target_max_model_len,
        )

    @staticmethod
    def create_draft_parallel_config(
        target_parallel_config: ParallelConfig,
        speculative_draft_tensor_parallel_size: Optional[int]
    ) -> ParallelConfig:
        """Create a parallel config for use by the draft worker.

        This is mostly a copy of the target parallel config, except the tp_size.

        Raises:
            ValueError: If an explicit draft tensor parallel size other than
                1 is requested.
        """
        if speculative_draft_tensor_parallel_size is None:
            speculative_draft_tensor_parallel_size = \
                target_parallel_config.tensor_parallel_size
        elif speculative_draft_tensor_parallel_size != 1:
            # TODO: allow tp values larger than 1
            # The trailing space keeps the two message fragments from fusing
            # into "cannot beother".
            raise ValueError(
                f"{speculative_draft_tensor_parallel_size=} cannot be "
                f"other value than 1")

        draft_parallel_config = ParallelConfig(
            pipeline_parallel_size=target_parallel_config.
            pipeline_parallel_size,
            tensor_parallel_size=speculative_draft_tensor_parallel_size,
            distributed_executor_backend=target_parallel_config.
            distributed_executor_backend,
            max_parallel_loading_workers=target_parallel_config.
            max_parallel_loading_workers,
            disable_custom_all_reduce=target_parallel_config.
            disable_custom_all_reduce,
            tokenizer_pool_config=target_parallel_config.tokenizer_pool_config,
            ray_workers_use_nsight=target_parallel_config.
            ray_workers_use_nsight,
            placement_group=target_parallel_config.placement_group,
        )

        return draft_parallel_config

    def __init__(
        self,
        draft_model_config: ModelConfig,
        draft_parallel_config: ParallelConfig,
        num_speculative_tokens: int,
        speculative_disable_by_batch_size: Optional[int],
        ngram_prompt_lookup_max: Optional[int],
        ngram_prompt_lookup_min: Optional[int],
        draft_token_acceptance_method: str,
        typical_acceptance_sampler_posterior_threshold: float,
        typical_acceptance_sampler_posterior_alpha: float,
        disable_logprobs: bool,
        disable_log_stats: bool,
    ):
        """Create a SpeculativeConfig object.

        Args:
            draft_model_config: ModelConfig for the draft model.
            draft_parallel_config: ParallelConfig for the draft model.
            num_speculative_tokens: The number of tokens to sample from the
                draft model before scoring with the target model.
            speculative_disable_by_batch_size: Disable speculative
                decoding for new incoming requests when the number of
                enqueue requests is larger than this value.
            ngram_prompt_lookup_max: Max size of ngram token window.
                None is stored as 0.
            ngram_prompt_lookup_min: Min size of ngram token window.
                None is stored as 0.
            draft_token_acceptance_method (str): The method to use for
                accepting draft tokens. This can take two possible
                values 'rejection_sampler' and 'typical_acceptance_sampler'
                for RejectionSampler and TypicalAcceptanceSampler
                respectively.
            typical_acceptance_sampler_posterior_threshold (float):
                A threshold value that sets a lower bound on the posterior
                probability of a token in the target model for it to be
                accepted. This threshold is used only when we use the
                TypicalAcceptanceSampler for token acceptance.
            typical_acceptance_sampler_posterior_alpha (float):
                A scaling factor for the entropy-based threshold in the
                TypicalAcceptanceSampler.
            disable_logprobs: If set to True, token log probabilities will not
                be returned even if requested by sampling parameters. This
                reduces latency by skipping logprob calculation in proposal
                sampling, target sampling, and after accepted tokens are
                determined. If set to False, log probabilities will be
                returned.
            disable_log_stats: Whether to disable periodic printing of stage
                times in speculative decoding.

        Raises:
            ValueError: If validation in ``_verify_args`` fails.
        """
        self.draft_model_config = draft_model_config
        self.draft_parallel_config = draft_parallel_config
        self.num_speculative_tokens = num_speculative_tokens
        self.speculative_disable_by_batch_size = \
            speculative_disable_by_batch_size
        self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0
        self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0
        self.draft_token_acceptance_method = draft_token_acceptance_method
        self.typical_acceptance_sampler_posterior_threshold = \
            typical_acceptance_sampler_posterior_threshold
        self.typical_acceptance_sampler_posterior_alpha = \
            typical_acceptance_sampler_posterior_alpha
        self.disable_logprobs = disable_logprobs
        self.disable_log_stats = disable_log_stats

        self._verify_args()

    def _verify_args(self) -> None:
        """Validate the stored settings.

        Raises:
            ValueError: If num_speculative_tokens is not positive, the
                acceptance method is missing or unknown, or either typical
                acceptance sampler parameter is negative.
        """
        if self.num_speculative_tokens <= 0:
            raise ValueError("Expected num_speculative_tokens to be greater "
                             f"than zero ({self.num_speculative_tokens}).")

        if self.draft_model_config:
            self.draft_model_config.verify_with_parallel_config(
                self.draft_parallel_config)

        # Validate and set draft token acceptance related settings.
        if self.draft_token_acceptance_method is None:
            raise ValueError("draft_token_acceptance_method is not set. "
                             "Expected values are rejection_sampler or "
                             "typical_acceptance_sampler.")

        if self.draft_token_acceptance_method not in (
                'rejection_sampler', 'typical_acceptance_sampler'):
            raise ValueError(
                "Expected draft_token_acceptance_method to be either "
                "rejection_sampler or typical_acceptance_sampler. Instead it "
                f"is {self.draft_token_acceptance_method}")

        # Only negative values are rejected, so the message says ">= 0".
        if (self.typical_acceptance_sampler_posterior_threshold < 0
                or self.typical_acceptance_sampler_posterior_alpha < 0):
            raise ValueError(
                "Expected typical_acceptance_sampler_posterior_threshold "
                "and typical_acceptance_sampler_posterior_alpha to be >= 0. "
                "Instead found "
                f"typical_acceptance_sampler_posterior_threshold = "
                f"{self.typical_acceptance_sampler_posterior_threshold} and "
                f"typical_acceptance_sampler_posterior_alpha = "
                f"{self.typical_acceptance_sampler_posterior_alpha}")

    @property
    def num_lookahead_slots(self) -> int:
        """The number of additional slots the scheduler should allocate per
        step, in addition to the slots allocated for each known token.

        This is equal to the number of speculative tokens, as each speculative
        token must be scored.
        """
        return self.num_speculative_tokens

    def __repr__(self) -> str:
        """Return a short summary naming the draft model and token count."""
        if self.ngram_prompt_lookup_max > 0:
            draft_model = "[ngram]"
        else:
            draft_model = self.draft_model_config.model
        num_spec_tokens = self.num_speculative_tokens
        return f"SpeculativeConfig({draft_model=}, {num_spec_tokens=})"
@dataclass
class LoRAConfig:
    """Settings for LoRA adapters, validated on construction."""

    max_lora_rank: int
    max_loras: int
    fully_sharded_loras: bool = False
    max_cpu_loras: Optional[int] = None
    lora_dtype: Optional[torch.dtype] = None
    lora_extra_vocab_size: int = 256
    # This is a constant.
    lora_vocab_padding_size: ClassVar[int] = 256
    long_lora_scaling_factors: Optional[Tuple[float]] = None

    def __post_init__(self):
        """Validate ranks and sizes; default max_cpu_loras to max_loras."""
        # Setting the maximum rank to 256 should be able to satisfy the vast
        # majority of applications.
        allowed_ranks = (8, 16, 32, 64, 128, 256)
        allowed_extra_vocab = (0, 256, 512)

        if self.max_lora_rank not in allowed_ranks:
            raise ValueError(
                f"max_lora_rank ({self.max_lora_rank}) must be one of "
                f"{allowed_ranks}.")
        if self.lora_extra_vocab_size not in allowed_extra_vocab:
            raise ValueError(
                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
                f"must be one of {allowed_extra_vocab}.")
        if self.max_loras < 1:
            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")

        if self.max_cpu_loras is None:
            self.max_cpu_loras = self.max_loras
            return
        if self.max_cpu_loras < self.max_loras:
            raise ValueError(
                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                f"max_loras ({self.max_loras})")

    def verify_with_model_config(self, model_config: ModelConfig):
        """Resolve lora_dtype against the model and warn on other quants."""
        if self.lora_dtype in (None, "auto"):
            self.lora_dtype = model_config.dtype
        elif isinstance(self.lora_dtype, str):
            self.lora_dtype = getattr(torch, self.lora_dtype)

        quant = model_config.quantization
        if quant and quant not in ["awq", "gptq"]:
            # TODO support all other quants
            logger.warning(f"{model_config.quantization} quantization is not "
                           "tested with LoRA yet.")

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
        """Reject scheduler settings that LoRA cannot be combined with."""
        token_budget = scheduler_config.max_num_batched_tokens
        if token_budget > 65528:
            raise ValueError(
                "Due to limitations of the custom LoRA CUDA kernel, "
                "max_num_batched_tokens must be <= 65528 when "
                "LoRA is enabled.")
        if scheduler_config.chunked_prefill_enabled:
            raise ValueError("LoRA is not supported with chunked prefill yet.")

    def verify_with_parallel_config(self, parallel_config: ParallelConfig):
        """Require the vocab padding size to divide evenly by world size."""
        remainder = self.lora_vocab_padding_size % parallel_config.world_size
        if remainder != 0:
            raise ValueError("LoRA vocab padding size must be divisible "
                             "by world size.")
@dataclass
class PromptAdapterConfig:
    """Settings for prompt adapters, validated on construction."""

    max_prompt_adapters: int
    max_prompt_adapter_token: int
    max_cpu_prompt_adapters: Optional[int] = None
    prompt_adapter_dtype: Optional[torch.dtype] = None

    def __post_init__(self):
        """Check that ``peft`` is importable and validate the limits.

        Raises:
            ImportError: If the ``peft`` package cannot be imported.
            ValueError: If max_prompt_adapters is below 1 or
                max_prompt_adapter_token is 0.
        """
        library_name = 'peft'
        try:
            __import__(library_name)
        except ImportError as e:
            # Trailing space keeps the two sentences from running together.
            raise ImportError(
                f"'{library_name}' is not installed for prompt adapter "
                f"support. "
                f"Please install it using 'pip install {library_name}'."
            ) from e

        if self.max_prompt_adapters < 1:
            raise ValueError(f"max_prompt_adapters "
                             f"({self.max_prompt_adapters}) must be >= 1.")
        if self.max_prompt_adapter_token == 0:
            raise ValueError("max_prompt_adapter_token must be set.")
        if self.max_cpu_prompt_adapters is None:
            self.max_cpu_prompt_adapters = self.max_prompt_adapters

    def verify_with_model_config(self, model_config: ModelConfig):
        """Resolve prompt_adapter_dtype: None/"auto" take the model dtype,
        other strings are looked up as attributes of ``torch``."""
        if self.prompt_adapter_dtype in (None, "auto"):
            self.prompt_adapter_dtype = model_config.dtype
        elif isinstance(self.prompt_adapter_dtype, str):
            self.prompt_adapter_dtype = getattr(torch,
                                                self.prompt_adapter_dtype)
@dataclass
class MultiModalConfig:
    """Placeholder config for multimodal models: describes the input data
    format and how such models should run. It has no fields yet."""
    # TODO: Add configs to init vision tower or not.
# Maps the lower-case dtype names accepted as strings to torch dtypes.
# "half"/"float16" and "float"/"float32" are aliases of each other.
_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.float16,
    "float16": torch.float16,
    "float": torch.float32,
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}

# dtype names left out of the "supported dtypes" list that is reported when
# float32 is requested on ROCm.
_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]
def _get_and_verify_dtype(
    config: PretrainedConfig,
    dtype: Union[str, torch.dtype],
) -> torch.dtype:
    """Resolve the requested dtype against the model config's dtype.

    Args:
        config: Model config; its ``torch_dtype`` (float32 when missing or
            None) is the reference dtype.
        dtype: A torch dtype, a name from ``_STR_DTYPE_TO_TORCH_DTYPE``, or
            "auto" (case-insensitive).

    Returns:
        The torch dtype to use.

    Raises:
        ValueError: For an unknown dtype, or for float32 on ROCm.
    """
    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
    # because config.torch_dtype can be None.
    config_dtype = getattr(config, "torch_dtype", None)
    if config_dtype is None:
        config_dtype = torch.float32

    if isinstance(dtype, str):
        dtype = dtype.lower()
        if dtype != "auto":
            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
                raise ValueError(f"Unknown dtype: {dtype}")
            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
        elif config_dtype != torch.float32:
            torch_dtype = config_dtype
        elif config.model_type == "gemma2":
            logger.info(
                "For Gemma 2, we downcast float32 to bfloat16 instead "
                "of float16 by default. Please specify `dtype` if you "
                "want to use float16.")
            torch_dtype = torch.bfloat16
        else:
            # Following the common practice, we use float16 for float32
            # models.
            torch_dtype = torch.float16
    elif isinstance(dtype, torch.dtype):
        torch_dtype = dtype
    else:
        raise ValueError(f"Unknown dtype: {dtype}")

    if is_hip() and torch_dtype == torch.float32:
        rocm_supported_dtypes = [
            name for name in _STR_DTYPE_TO_TORCH_DTYPE
            if name not in _ROCM_NOT_SUPPORTED_DTYPE
        ]
        raise ValueError(f"dtype '{dtype}' is not supported in ROCm. "
                         f"Supported dtypes are {rocm_supported_dtypes}")

    # Verify the dtype. Upcasting to float32 and downcasting from float32
    # are silently allowed; casting between float16 and bfloat16 is allowed
    # with a warning.
    involves_float32 = (torch_dtype == torch.float32
                        or config_dtype == torch.float32)
    if torch_dtype != config_dtype and not involves_float32:
        logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

    return torch_dtype
def _get_and_verify_max_len(
    hf_config: PretrainedConfig,
    max_model_len: Optional[int],
    disable_sliding_window: bool,
    sliding_window_len: Optional[int],
    rope_scaling_arg: Optional[Dict[str, Any]],
) -> int:
    """Get and verify the model's maximum length.

    Args:
        hf_config: Model config that is probed for length-related attributes
            and for ``rope_scaling``.
        max_model_len: User-requested maximum length, or None to use the
            derived one.
        disable_sliding_window: If True, the sliding window length also caps
            the derived maximum.
        sliding_window_len: Sliding window length from the model config, if
            any.
        rope_scaling_arg: User-provided rope scaling dict; when given, a
            max_model_len above the derived maximum is accepted with a
            warning instead of rejected.

    Returns:
        The maximum model length to use.

    Raises:
        ValueError: If max_model_len exceeds the derived maximum without a
            rope_scaling_arg, or if the config's rope_scaling lacks the
            "factor" entry it requires.
        NotImplementedError: If sliding window is disabled for a model whose
            rope_scaling type needs a scaling factor.
    """
    derived_max_model_len = float("inf")
    possible_keys = [
        # Cohere / Command-R: needs to prioritize this over
        # "max_position_embeddings"
        "model_max_length",
        # OPT
        "max_position_embeddings",
        # GPT-2
        "n_positions",
        # MPT
        "max_seq_len",
        # ChatGLM2
        "seq_length",
        # Others
        "max_sequence_length",
        "max_seq_length",
        "seq_len",
    ]
    # Choose the smallest "max_length" from the possible keys.
    for key in possible_keys:
        max_len = getattr(hf_config, key, None)
        if max_len is not None:
            derived_max_model_len = min(derived_max_model_len, max_len)

    # If sliding window is manually disabled, max_length should be less
    # than the sliding window length in the model config.
    if disable_sliding_window and sliding_window_len is not None:
        derived_max_model_len = min(derived_max_model_len, sliding_window_len)

    # If none of the keys were found in the config, use a default and
    # log a warning.
    if derived_max_model_len == float("inf"):
        if max_model_len is not None:
            # If max_model_len is specified, we use it.
            return max_model_len

        default_max_len = 2048
        logger.warning(
            "The model's config.json does not contain any of the following "
            "keys to determine the original maximum length of the model: "
            f"{possible_keys}. Assuming the model's maximum length is "
            f"{default_max_len}.")
        derived_max_model_len = default_max_len

    rope_scaling = getattr(hf_config, "rope_scaling", None)
    if rope_scaling is not None:
        rope_type = rope_scaling.get("type", rope_scaling.get("rope_type"))
        if rope_type not in {"su", "longrope", "llama3"}:
            if disable_sliding_window:
                # TODO: Find a model that supports rope_scaling
                # with sliding window to see if this case should be allowed.
                raise NotImplementedError(
                    "Disabling sliding window is not supported for models "
                    "with rope_scaling. Please raise an issue so we can "
                    "investigate.")

            # Explicit check instead of `assert`, which is stripped when
            # Python runs with -O.
            if "factor" not in rope_scaling:
                raise ValueError(
                    "rope_scaling must contain a 'factor' entry for "
                    f"rope type {rope_type!r}.")
            scaling_factor = rope_scaling["factor"]
            if rope_type == "yarn":
                derived_max_model_len = rope_scaling[
                    "original_max_position_embeddings"]
            derived_max_model_len *= scaling_factor

    if max_model_len is None:
        max_model_len = derived_max_model_len
    elif max_model_len > derived_max_model_len:
        if rope_scaling_arg is None:
            raise ValueError(
                f"User-specified max_model_len {max_model_len} is higher than "
                f"the original {derived_max_model_len}. "
                "Please provide a rope_scaling dict to scale the model.")
        # The user supplied a rope_scaling dict: accept the longer length.
        logger.warning(
            f"User-specified max_model_len {max_model_len} is higher than "
            f"the original {derived_max_model_len}. "
            "Attempting to use RoPE scaling with the provided rope_scaling "
            "dict.")
    return int(max_model_len)
def get_served_model_name(model: str,
                          served_model_name: Optional[Union[str, List[str]]]):
    """Resolve the name under which the model is served.

    A non-empty list yields its first entry, and a non-empty string is
    returned unchanged. Any falsy value (``None``, an empty string or an
    empty list) falls back to ``model``.
    """
    if served_model_name:
        # Only a list needs unpacking; a plain string is already the name.
        if isinstance(served_model_name, list):
            return served_model_name[0]
        return served_model_name
    return model
@dataclass
class DecodingConfig:
    """Dataclass which contains the decoding strategy of the engine.

    Raises:
        ValueError: from ``__post_init__`` when ``guided_decoding_backend``
            is not one of the supported backends.
    """

    # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
    guided_decoding_backend: str = 'outlines'

    def __post_init__(self):
        """Validate that the selected guided decoding backend is supported."""
        valid_guided_backends = ['outlines', 'lm-format-enforcer']
        backend = self.guided_decoding_backend
        if backend not in valid_guided_backends:
            # The quote around the backend name is closed and a space
            # separates the two fragments so the message reads correctly.
            raise ValueError(f"Invalid guided_decoding_backend '{backend}', "
                             f"must be one of {valid_guided_backends}")
@dataclass(frozen=True)
class EngineConfig:
    """Frozen bundle of every engine-related configuration object.

    Grouping the individual configs in a single dataclass makes it easier
    to pass them around the codebase as one unit.
    """

    model_config: ModelConfig
    cache_config: CacheConfig
    parallel_config: ParallelConfig
    scheduler_config: SchedulerConfig
    device_config: DeviceConfig
    load_config: LoadConfig
    lora_config: Optional[LoRAConfig]
    multimodal_config: Optional[MultiModalConfig]
    speculative_config: Optional[SpeculativeConfig]
    decoding_config: Optional[DecodingConfig]
    prompt_adapter_config: Optional[PromptAdapterConfig]

    def __post_init__(self):
        """Cross-check the individual configs against one another."""
        model = self.model_config
        parallel = self.parallel_config

        model.verify_with_parallel_config(parallel)
        self.cache_config.verify_with_parallel_config(parallel)

        # The optional configs are only verified when they are set.
        lora = self.lora_config
        if lora:
            lora.verify_with_model_config(model)
            lora.verify_with_scheduler_config(self.scheduler_config)
            lora.verify_with_parallel_config(parallel)

        prompt_adapter = self.prompt_adapter_config
        if prompt_adapter:
            prompt_adapter.verify_with_model_config(model)

    def to_dict(self):
        """Return a field-name -> value mapping, for use in **kwargs."""
        return {entry.name: getattr(self, entry.name)
                for entry in fields(self)}
|