12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939 |
- import enum
- import json
- import os
- from dataclasses import dataclass, field, fields
- from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Mapping,
- Optional, Tuple, Type, Union)
- import torch
- from loguru import logger
- from transformers import PretrainedConfig
- import aphrodite.common.envs as envs
- from aphrodite.common.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
- cuda_device_count_stateless,
- get_cpu_memory, is_cpu, is_hip, is_neuron,
- is_openvino, is_xpu, print_warning_once)
- from aphrodite.distributed import get_current_tp_rank_partition_size
- from aphrodite.modeling.models import ModelRegistry
- from aphrodite.platforms import current_platform
- from aphrodite.quantization import QUANTIZATION_METHODS
- from aphrodite.transformers_utils.config import (ConfigFormat, get_config,
- get_hf_image_processor_config,
- get_hf_text_config)
- from aphrodite.triton_utils import HAS_TRITON
- if TYPE_CHECKING:
- from ray.util.placement_group import PlacementGroup
- from aphrodite.executor.executor_base import ExecutorBase
- from aphrodite.modeling.model_loader.loader import BaseModelLoader
- from aphrodite.transformers_utils.tokenizer_group.base_tokenizer_group import ( # noqa: E501
- BaseTokenizerGroup)
# If true, will load models from ModelScope instead of Hugging Face Hub.
# The flag is copied from the project's env helper module at import time.
APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE

# NOTE(review): presumably the batched-token cap applied to embedding
# models; it is not referenced in the visible part of this file -- confirm.
_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768

# Architecture names accepted by ModelConfig.verify_with_parallel_config
# when pipeline_parallel_size > 1; any other architecture raises
# NotImplementedError there.
_PP_SUPPORTED_MODELS = [
    "AquilaModel",
    "AquilaForCausalLM",
    "InternLMForCausalLM",
    "LlamaForCausalLM",
    "LLaMAForCausalLM",
    "MistralForCausalLM",
    "Phi3ForCausalLM",
    "MixtralForCausalLM",
    "NemotronForCausalLM",
    "Qwen2ForCausalLM",
    "Qwen2MoeForCausalLM",
]

# Quantization method names that do NOT trigger the "not fully optimized
# yet" warning emitted by ModelConfig._verify_quantization.
_OPTIMIZED_QUANTS = [
    "fp2",
    "fp3",
    "fp4",
    "fp5",
    "fp6",
    "fp7",
    "fp8",
    "marlin",
    "gptq_marlin_24",
    "gptq_marlin",
    "awq_marlin",
    "fbgemm_fp8",
    "compressed-tensors",
    "compressed_tensors",
    "experts_int8",
    "quant_llm",
]
class ModelConfig:
    """Configuration for the model.

    Args:
        model: Name or path of the huggingface model to use.
            It is also used as the content for `model_name` tag in metrics
            output when `served_model_name` is not specified.
        tokenizer: Name or path of the huggingface tokenizer to use.
        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
            available, "slow" will always use the slow tokenizer, and
            "mistral" will always use the tokenizer from `mistral_common`.
            The value is matched case-insensitively and stored lower-cased.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        dtype: Data type for model weights and activations. The "auto" option
            will use FP16 precision for FP32 and FP16 models, and BF16 precision
            for BF16 models.
        seed: Random seed for reproducibility.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id. If unspecified, will use the default
            version.
        code_revision: The specific revision to use for the model code on
            Hugging Face Hub. It can be a branch name, a tag name, or a
            commit id. If unspecified, will use the default version.
        rope_scaling: Dictionary containing the scaling configuration for the
            RoPE embeddings. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        rope_theta: Forwarded to the HF config loader together with
            `rope_scaling`. NOTE(review): presumably overrides the RoPE base
            frequency -- confirm against `get_config`.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id. If unspecified, the
            model `revision` is used.
        max_model_len: Maximum length of a sequence (including prompt and
            output). If None, will be derived from the model.
        spec_target_max_model_len: Forwarded to the max-length derivation
            helper. NOTE(review): presumably the target model's max length
            when this config describes a speculative draft model -- confirm.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
        deepspeed_fp_bits: Number of bits to use for DeepSpeed FP quantization.
            Supported number of bits are: 4, 6, 8, 12. Required when
            `quantization` is "deepspeedfp".
        quant_llm_fp_bits: Number of bits to use for QuantLLM FP quantization.
            Must be one of 2, 3, 4, 5, 6, 7. Required when `quantization` is
            "quant_llm".
        quant_llm_exp_bits: Number of exponent bits for QuantLLM FP
            quantization. Must be one of 1, 2, 3, 4, 5. If None, a per-width
            default is chosen.
        quantization_param_path: Path to JSON file containing scaling factors.
            Used to load KV cache scaling factors into the model when KV cache
            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
            be used to load activation and weight scaling factors when the
            model dtype is FP8_E4M3 on ROCm.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
            If None, the user did not specify, so default to False -
            except for encoder/decoder models, which currently require
            eager mode.
        max_context_len_to_capture: DEPRECATED. Passing any value other than
            None raises ValueError; use max_seq_len_to_capture instead.
        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode. Capped at `max_model_len`.
        max_logprobs: Stored as-is on the instance (default 5).
            NOTE(review): presumably the maximum number of log-probabilities
            a request may ask for -- confirm against callers.
        disable_sliding_window: Whether to disable sliding window. If True,
            we will disable the sliding window functionality of the model.
            If the model does not support sliding window, this argument is
            ignored.
        skip_tokenizer_init: If true, skip initialization of tokenizer and
            detokenizer.
        served_model_name: The model name used in metrics tag `model_name`,
            matches the model name exposed via the APIs. If multiple model
            names provided, the first name will be used. If not specified,
            the model name will be the same as `model`.
        limit_mm_per_prompt: Maximum number of data instances per modality
            per prompt. Only applicable for multimodal models; a non-empty
            value for any other model raises ValueError.
        config_format: The config format which will be loaded. Defaults to
            'auto' which defaults to 'hf'.
    """
def __init__(
    self,
    model: str,
    tokenizer: str,
    tokenizer_mode: str,
    trust_remote_code: bool,
    dtype: Union[str, torch.dtype],
    seed: int,
    revision: Optional[str] = None,
    code_revision: Optional[str] = None,
    rope_scaling: Optional[dict] = None,
    rope_theta: Optional[float] = None,
    tokenizer_revision: Optional[str] = None,
    max_model_len: Optional[int] = None,
    spec_target_max_model_len: Optional[int] = None,
    quantization: Optional[str] = None,
    deepspeed_fp_bits: Optional[int] = None,
    quant_llm_fp_bits: Optional[int] = None,
    quant_llm_exp_bits: Optional[int] = None,
    quantization_param_path: Optional[str] = None,
    enforce_eager: Optional[bool] = None,
    max_context_len_to_capture: Optional[int] = None,
    max_seq_len_to_capture: Optional[int] = None,
    max_logprobs: int = 5,
    disable_sliding_window: bool = False,
    skip_tokenizer_init: bool = False,
    served_model_name: Optional[Union[str, List[str]]] = None,
    limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
    config_format: ConfigFormat = ConfigFormat.AUTO,
) -> None:
    """Store the user settings, load the HF config and validate everything.

    The steps run in a fixed order because later steps read attributes
    set by earlier ones: raw settings are stored, the HF configs are
    loaded, dtype / eager mode / sliding window / max length are
    resolved, and finally the `_verify_*` helpers run.

    Raises:
        ValueError: if `max_context_len_to_capture` is given, or if an
            encoder/decoder model is requested with eager mode
            explicitly disabled. The helpers called here may raise too.
    """
    # --- Plain pass-through settings -------------------------------
    self.model = model
    self.tokenizer = tokenizer
    self.tokenizer_mode = tokenizer_mode
    self.trust_remote_code = trust_remote_code
    self.seed = seed
    self.revision = revision
    self.code_revision = code_revision
    self.rope_scaling = rope_scaling
    self.rope_theta = rope_theta
    # Unless pinned explicitly, the tokenizer follows the model revision.
    self.tokenizer_revision = (revision if tokenizer_revision is None else
                               tokenizer_revision)
    self.quantization = quantization
    self.deepspeed_fp_bits = deepspeed_fp_bits
    self.quant_llm_fp_bits = quant_llm_fp_bits
    self.quant_llm_exp_bits = quant_llm_exp_bits
    self.quantization_param_path = quantization_param_path
    self.enforce_eager = enforce_eager

    # The deprecated knob is rejected outright rather than honoured.
    self.max_context_len_to_capture = max_context_len_to_capture
    if max_context_len_to_capture is not None:
        raise ValueError("`max_context_len_to_capture` is deprecated. "
                         "Use `max_seq_len_to_capture` instead.")
    self.max_seq_len_to_capture = (max_seq_len_to_capture
                                   or max_context_len_to_capture)
    self.max_logprobs = max_logprobs
    self.disable_sliding_window = disable_sliding_window
    self.skip_tokenizer_init = skip_tokenizer_init

    # --- Hugging Face configuration objects -------------------------
    self.hf_config = get_config(self.model, trust_remote_code, revision,
                                code_revision, rope_scaling, rope_theta,
                                config_format)
    self.hf_text_config = get_hf_text_config(self.hf_config)
    self.hf_image_processor_config = get_hf_image_processor_config(
        self.model, revision)
    self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

    # --- Resolve the enforce_eager default --------------------------
    if not getattr(self.hf_config, 'is_encoder_decoder', False):
        # Decoder-only models: an unset value simply means "not eager".
        if self.enforce_eager is None:
            self.enforce_eager = False
    elif self.enforce_eager is None:
        # Encoder/decoder models need eager mode; say so in the log
        # because the user never asked for it.
        logger.info("Forcing enforce_eager == True because "
                    "enforce_eager setting was unspecified and "
                    "CUDAGraph is not supported with encoder/ "
                    "decoder models.")
        self.enforce_eager = True
    elif not self.enforce_eager:
        # The user explicitly asked for CUDA graphs on an
        # encoder/decoder model, which is not supported.
        raise ValueError(STR_NOT_IMPL_ENC_DEC_CUDAGRAPH)

    # --- Interleaved sliding-window attention ------------------------
    window = getattr(self.hf_text_config, "sliding_window", None)
    interleaved = (window is not None) and (
        isinstance(window, list)
        or self.hf_text_config.model_type == "gemma2")
    if interleaved and not self.disable_sliding_window:
        sliding_window_len_min = get_min_sliding_window(
            self.hf_text_config.sliding_window)
        print_warning_once(
            f"{self.hf_text_config.model_type} has interleaved attention, "
            "which is currently not supported by vLLM. Disabling sliding "
            "window and capping the max length to the sliding window size "
            f"({sliding_window_len_min}).")
        self.disable_sliding_window = True

    # --- Derived values ----------------------------------------------
    self.max_model_len = _get_and_verify_max_len(
        hf_config=self.hf_text_config,
        max_model_len=max_model_len,
        disable_sliding_window=self.disable_sliding_window,
        sliding_window_len=self.get_hf_config_sliding_window(),
        spec_target_max_model_len=spec_target_max_model_len,
        rope_scaling_arg=self.rope_scaling)
    self.served_model_name = get_served_model_name(model,
                                                   served_model_name)
    self.multimodal_config = self._init_multimodal_config(
        limit_mm_per_prompt)

    # --- Validation ---------------------------------------------------
    if not self.skip_tokenizer_init:
        self._verify_tokenizer_mode()
    self._verify_embedding_mode()
    self._verify_quantization()
    self._verify_cuda_graph()
def _init_multimodal_config(
    self, limit_mm_per_prompt: Optional[Mapping[str, int]]
) -> Optional["MultiModalConfig"]:
    """Build the multimodal config, or return None for other models.

    A model counts as multimodal as soon as one of its HF architectures
    is reported as such by the model registry.

    Raises:
        ValueError: if a non-empty `limit_mm_per_prompt` is given for a
            model that is not multimodal.
    """
    for arch in getattr(self.hf_config, "architectures", []):
        if ModelRegistry.is_multimodal_model(arch):
            return MultiModalConfig(
                limit_per_prompt=limit_mm_per_prompt or {})

    # Not multimodal: per-modality limits make no sense here.
    if limit_mm_per_prompt:
        raise ValueError(
            "limit_mm_per_prompt is only supported for multimodal "
            "models.")
    return None
def _verify_tokenizer_mode(self) -> None:
    """Normalise `tokenizer_mode` to lower case and reject unknown modes.

    Raises:
        ValueError: if the mode is not 'auto', 'slow' or 'mistral'.
    """
    normalised = self.tokenizer_mode.lower()
    if normalised in ("auto", "slow", "mistral"):
        self.tokenizer_mode = normalised
        return
    raise ValueError(
        f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
        "either 'auto', 'slow' or 'mistral'.")
def _verify_embedding_mode(self) -> None:
    """Set `embedding_mode` from the HF architectures.

    The flag is True when the model registry reports at least one of the
    listed architectures as an embedding model.
    """
    found = False
    for arch in getattr(self.hf_config, "architectures", []):
        if ModelRegistry.is_embedding_model(arch):
            found = True
            break
    self.embedding_mode = found
def _parse_quant_hf_config(self):
    """Return the quantization section of the HF config, or None.

    `quantization_config` is preferred; compressed-tensors checkpoints
    store the same information under `compression_config` instead.
    """
    for key in ("quantization_config", "compression_config"):
        section = getattr(self.hf_config, key, None)
        if section is not None:
            return section
    return None
def _verify_quantization(self) -> None:
    """Resolve and validate the quantization method for this model.

    Steps, in order:
      1. Lower-case the user-supplied method name.
      2. If the HF config carries a quantization section, read its
         method, let every registered method propose an override, and
         make sure the result agrees with the user's choice.
      3. For "deepspeedfp", "quant_llm" and the online "fpN" methods,
         synthesise `hf_config.quantization_config`.
      4. Check the final method against the supported lists for the
         current platform and warn if it is not an optimized one.

    Side effects: may rewrite `self.quantization`,
    `self.quant_llm_exp_bits`, `self.hf_config.quantization_config`,
    and (for the online "fpN" methods) `self.dtype` and
    `self.enforce_eager`.

    Raises:
        ValueError: on an unknown or platform-unsupported method, a
            mismatch between the checkpoint and the `quantization`
            argument, or missing/invalid bit-width settings.
    """
    supported_quantization = [*QUANTIZATION_METHODS]
    rocm_supported_quantization = ["gptq", "squeezellm", "fp8"]
    tpu_supported_quantization = ["tpu_int8"]
    if self.quantization is not None:
        self.quantization = self.quantization.lower()

    # Parse quantization method from the HF model config, if available.
    quant_cfg = self._parse_quant_hf_config()

    if quant_cfg is not None:
        quant_method = quant_cfg.get("quant_method", "").lower()

        # Detect which checkpoint is it: the first method that proposes
        # an override wins.
        for method in QUANTIZATION_METHODS.values():
            quantization_override = method.override_quantization_method(
                quant_cfg, self.quantization)
            if quantization_override:
                if quantization_override == "awq_marlin":
                    # Keep the method named by the checkpoint instead
                    # of switching to the marlin kernels.
                    logger.warning(
                        "awq_marlin kernels are temporarily disabled, "
                        "they will be re-enabled with a future release. "
                        "Falling back to AWQ kernels.")
                else:
                    quant_method = quantization_override
                    self.quantization = quantization_override
                break

        # Verify quantization configurations.
        if self.quantization is None:
            self.quantization = quant_method
        elif self.quantization != quant_method:
            raise ValueError(
                "Quantization method specified in the model config "
                f"({quant_method}) does not match the quantization "
                f"method specified in the `quantization` argument "
                f"({self.quantization}).")

    if self.quantization == "deepspeedfp":
        # Default group size depends on the bit width; the environment
        # variable DEEPSPEED_GROUP_SIZE overrides it.
        gs = 32 if self.deepspeed_fp_bits == 4 else 128
        self.hf_config.quantization_config = {
            "bits": self.deepspeed_fp_bits,
            "group_size": int(os.environ.get("DEEPSPEED_GROUP_SIZE", gs)),
            "quant_method": "deepspeedfp"
        }

    VALID_QUANT_LLM_FP_BITS = [2, 3, 4, 5, 6, 7]
    VALID_QUANT_LLM_EXPONENTS = [1, 2, 3, 4, 5]
    # The formula is mantissa_bits = fp_bits - exp_bits - 1
    # The default exp_bits for each fp_bits are as follows:
    DEFAULT_EXP_BITS = {
        2: 1,
        3: 2,
        4: 2,
        5: 2,
        6: 2,
        7: 3,
    }

    if self.quantization == "quant_llm":
        if self.quant_llm_fp_bits is None:
            raise ValueError(
                "quant_llm_fp_bits must be specified when using "
                "quant_llm quantization."
            )
        if self.quant_llm_fp_bits not in VALID_QUANT_LLM_FP_BITS:
            raise ValueError(
                f"Invalid quant_llm_fp_bits: {self.quant_llm_fp_bits}. "
                f"Must be one of {VALID_QUANT_LLM_FP_BITS}."
            )
        if self.quant_llm_exp_bits is None:
            self.quant_llm_exp_bits = DEFAULT_EXP_BITS[
                self.quant_llm_fp_bits]
        else:
            if self.quant_llm_exp_bits not in VALID_QUANT_LLM_EXPONENTS:
                raise ValueError(
                    f"Invalid exponent bits: {self.quant_llm_exp_bits}. "
                    f"Must be one of {VALID_QUANT_LLM_EXPONENTS}."
                )
            # One bit is the sign, so the exponent may use at most
            # fp_bits - 1 bits; anything larger would leave a negative
            # mantissa width under the formula above.
            max_exp_bits = self.quant_llm_fp_bits - 1
            if self.quant_llm_exp_bits > max_exp_bits:
                raise ValueError(
                    f"Invalid exponent bits: {self.quant_llm_exp_bits}. "
                    f"Must be at most {max_exp_bits} when "
                    f"quant_llm_fp_bits is {self.quant_llm_fp_bits}."
                )

        self.hf_config.quantization_config = {
            "bits": self.quant_llm_fp_bits,
            "exp_bits": self.quant_llm_exp_bits,
            "quant_method": "quant_llm"
        }

    online_quant_methods = ["fp2", "fp3", "fp4", "fp5", "fp6", "fp7"]
    if self.quantization is not None and self.quantization in \
        online_quant_methods:
        # The bit width is the digit after the "fp" prefix.
        fp_bits = int(self.quantization[2])
        if fp_bits not in VALID_QUANT_LLM_FP_BITS:
            raise ValueError(
                f"Invalid quant_llm_fp_bits: {fp_bits}. "
                f"Must be one of {VALID_QUANT_LLM_FP_BITS}."
            )
        if fp_bits in [2, 3]:
            logger.warning("FP2 and FP3 quantization methods lead to "
                           "significant accuracy loss. Use them with "
                           "caution. Model may be incoherent.")
        exp_bits = DEFAULT_EXP_BITS[fp_bits]
        self.hf_config.quantization_config = {
            "bits": fp_bits,
            "exp_bits": exp_bits,
            "quant_method": self.quantization
        }
        # Online FP quantization forces fp16 weights and eager mode.
        self.dtype = torch.float16
        self.enforce_eager = True

    if self.quantization is not None:
        if self.quantization not in supported_quantization:
            raise ValueError(
                f"Unknown quantization method: {self.quantization}. Must "
                f"be one of {supported_quantization}.")
        if is_hip(
        ) and self.quantization not in rocm_supported_quantization:
            raise ValueError(
                f"{self.quantization} quantization is currently not "
                "supported in ROCm.")
        if current_platform.is_tpu(
        ) and self.quantization not in tpu_supported_quantization:
            raise ValueError(
                f"{self.quantization} quantization is currently not "
                f"supported in TPU Backend.")
        if self.quantization not in _OPTIMIZED_QUANTS:
            logger.warning(
                f"{self.quantization} quantization is not fully "
                "optimized yet. The speed can be slower than "
                "non-quantized models.")
        if self.quantization == "deepspeedfp" and self.deepspeed_fp_bits \
            is None:
            raise ValueError(
                "deepspeed_fp_bits must be specified when using "
                "deepspeedfp quantization.")
def _verify_cuda_graph(self) -> None:
    """Clamp `max_seq_len_to_capture` to `max_model_len`.

    An unset (None) capture length defaults to the model length.
    """
    limit = self.max_model_len
    requested = self.max_seq_len_to_capture
    if requested is None:
        requested = limit
    self.max_seq_len_to_capture = min(requested, limit)
def verify_with_parallel_config(
    self,
    parallel_config: "ParallelConfig",
) -> None:
    """Check that this model can run under the given parallel layout.

    Raises:
        ValueError: if a quantized model's attention-head count is not
            divisible by the tensor parallel size, or if BitsAndBytes is
            combined with TP/PP or with `enforce_eager=False`.
        NotImplementedError: if pipeline parallelism is requested for an
            architecture outside `_PP_SUPPORTED_MODELS`.
    """
    head_count = getattr(self.hf_text_config, "num_attention_heads", 0)
    tp_size = parallel_config.tensor_parallel_size
    if head_count % tp_size != 0 and self.quantization is not None:
        raise ValueError(
            f"Total number of attention heads "
            f"({head_count})"
            " must be divisible by tensor parallel size "
            f"({tp_size}) when quantization is used.")

    pp_size = parallel_config.pipeline_parallel_size
    archs = getattr(self.hf_config, "architectures", [])
    has_unsupported_arch = any(arch not in _PP_SUPPORTED_MODELS
                               for arch in archs)
    if has_unsupported_arch and pp_size > 1:
        raise NotImplementedError(
            "Pipeline parallelism is only supported for the following "
            f" architectures: {_PP_SUPPORTED_MODELS}.")

    if self.quantization == "bitsandbytes":
        if (parallel_config.tensor_parallel_size > 1
                or parallel_config.pipeline_parallel_size > 1):
            raise ValueError(
                "BitsAndBytes quantization with TP/PP is not supported yet.")
        # Only an explicit False is rejected; None is let through.
        if self.enforce_eager is False:
            raise ValueError(
                "BitsAndBytes with enforce_eager=False is not supported yet.")
def is_attention_free(self) -> bool:
    """Tell whether the model has no attention layers at all.

    Such a model keeps no state that grows with the context length.
    Only the 'mamba' model type is recognised for now; extend this
    check as more attention-free models are supported.
    """
    model_type = getattr(self.hf_text_config, "model_type", None)
    return model_type == 'mamba'
def get_hf_config_sliding_window(
        self) -> Union[Optional[int], List[Optional[int]]]:
    """Return the sliding window declared by the HF config, if any.

    Some models (e.g. Qwen2 and Qwen1.5) carry a `use_sliding_window`
    switch next to the window size; when that switch is present and
    falsy the window is reported as None.
    """
    text_config = self.hf_text_config
    switch_present = hasattr(text_config, "use_sliding_window")
    if switch_present and not text_config.use_sliding_window:
        return None
    return getattr(text_config, "sliding_window", None)
def get_sliding_window(self) -> Optional[Union[int, List[Optional[int]]]]:
    """Return the effective sliding window, or None when disabled.

    The user-level `disable_sliding_window` switch wins; otherwise the
    value comes from the HF config.
    """
    if not self.disable_sliding_window:
        return self.get_hf_config_sliding_window()
    return None
def get_vocab_size(self) -> int:
    """Return `vocab_size` from the HF text config."""
    text_config = self.hf_text_config
    return text_config.vocab_size
def get_hidden_size(self) -> int:
    """Return `hidden_size` from the HF text config."""
    text_config = self.hf_text_config
    return text_config.hidden_size
def get_head_size(self) -> int:
    """Return the per-head dimension used for attention.

    Resolution order:
      1. 'deepseek_v2' models always report 256.
      2. Attention-free models and speculative helper models
         ('medusa', 'mlp_speculator') report 0.
      3. An explicit, non-None `head_dim` on the HF text config.
      4. `hidden_size // num_attention_heads` as the fallback.

    A `head_dim` attribute that is present but set to None is treated as
    absent, so the method never returns None.
    """
    # TODO remove hard code
    spec_model_types = ["medusa", "mlp_speculator"]
    # Read model_type once and tolerate configs that do not define it.
    model_type = getattr(self.hf_text_config, "model_type", None)
    if model_type == 'deepseek_v2':
        # FlashAttention supports only head_size 32, 64, 128, 256,
        # we need to pad head_size 192 to 256
        return 256
    if self.is_attention_free() or model_type in spec_model_types:
        return 0
    head_dim = getattr(self.hf_text_config, "head_dim", None)
    if head_dim is not None:
        return head_dim
    # FIXME: This may not be true for all models.
    return (self.hf_text_config.hidden_size //
            self.hf_text_config.num_attention_heads)
def get_total_num_kv_heads(self) -> int:
    """Return the total number of KV heads across all ranks.

    Special cases are handled first (multi-query models, MPT, DBRX,
    attention-free models); otherwise the first non-None value among the
    known KV-head attribute names wins, and models without any of them
    fall back to `num_attention_heads`.
    """
    hf_config = self.hf_config

    # For GPTBigCode & Falcon:
    # NOTE: for falcon, when new_decoder_architecture is True, the
    # multi_query flag is ignored and we use n_head_kv for the number of
    # KV heads.
    is_new_arch_falcon = (
        hf_config.model_type in ["falcon", "RefinedWeb", "RefinedWebModel"]
        and getattr(hf_config, "new_decoder_architecture", False))
    if not is_new_arch_falcon and getattr(self.hf_text_config,
                                          "multi_query", False):
        # Multi-query attention, only one KV head.
        # Currently, tensor parallelism is not supported in this case.
        return 1

    # MPT: attn_config is queried like a mapping.
    if hf_config.model_type == "mpt":
        attn_config = hf_config.attn_config
        return (attn_config["kv_n_heads"] if "kv_n_heads" in attn_config
                else hf_config.num_attention_heads)

    # DBRX: attn_config is queried like an object with attributes.
    if hf_config.model_type == "dbrx":
        return getattr(hf_config.attn_config, "kv_n_heads",
                       hf_config.num_attention_heads)

    if self.is_attention_free():
        return 0

    candidate_names = (
        "n_head_kv",  # Falcon
        "num_kv_heads",  # Falcon
        "num_key_value_heads",  # LLaMA-2
        "multi_query_group_num",  # ChatGLM
    )
    for name in candidate_names:
        value = getattr(self.hf_text_config, name, None)
        if value is not None:
            return value

    # For non-grouped-query attention models, the number of KV heads is
    # equal to the number of attention heads.
    return self.hf_text_config.num_attention_heads
def get_num_kv_heads(self,
                     parallel_config: "ParallelConfig",
                     tp_rank: int = 0) -> int:
    """Return the number of KV heads held by one tensor-parallel rank.

    The total KV-head count is partitioned over the tensor-parallel
    ranks. When there are fewer KV heads than ranks the heads are
    replicated, so the result is never below 1.
    """
    total_kv_heads = self.get_total_num_kv_heads()
    tp_size = parallel_config.tensor_parallel_size
    per_rank = get_current_tp_rank_partition_size(total_kv_heads, tp_rank,
                                                  tp_size)
    # Every GPU keeps at least one KV head.
    return max(1, per_rank)
def get_num_attention_heads(self,
                            parallel_config: "ParallelConfig",
                            tp_rank: int = 0) -> int:
    """Return the number of attention heads held by one TP rank.

    Returns 0 when the HF text config has no `num_attention_heads`.
    Otherwise the per-rank KV-head count is multiplied by the number of
    query heads per KV head, so that for GQA a whole head group always
    stays together on the same GPU.
    """
    total_heads = getattr(self.hf_text_config, "num_attention_heads", None)
    if total_heads is None:
        return 0
    total_kv_heads = self.get_total_num_kv_heads()
    local_kv_heads = self.get_num_kv_heads(parallel_config, tp_rank)
    heads_per_kv_head = total_heads // total_kv_heads
    return local_kv_heads * heads_per_kv_head
def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
    """Return how many hidden layers fall on this pipeline stage.

    The pipeline rank is derived as `rank // tensor_parallel_size`; the
    layer range for that rank comes from `get_pp_indices`. A text config
    without `num_hidden_layers` is treated as having zero layers.
    """
    # Imported here rather than at module level, as in the original.
    from aphrodite.distributed.utils import get_pp_indices

    layer_total = getattr(self.hf_text_config, "num_hidden_layers", 0)
    stage = parallel_config.rank // parallel_config.tensor_parallel_size
    stage_count = parallel_config.pipeline_parallel_size
    first, last = get_pp_indices(layer_total, stage, stage_count)
    return last - first
def contains_seqlen_agnostic_layers(
        self, parallel_config: "ParallelConfig") -> bool:
    """True for Mamba/SSM models (Jamba).

    That is, True when at least one layer on this stage is not an
    "attention" layer.
    """
    agnostic_count = self._get_num_seqlen_agnostic_layers(parallel_config)
    return agnostic_count > 0
def get_layers_block_type(self,
                          parallel_config: "ParallelConfig") -> List[str]:
    """Return the block type of each layer.

    Attention-free models yield "mamba" for every layer. Otherwise the
    HF config's `layers_block_type` is returned when it exists (recent
    Transformers versions expose it as a property), and a list of
    "attention" entries is used as the default.
    """
    layer_count = self.get_num_layers(parallel_config)
    if not self.is_attention_free():
        all_attention = ["attention"] * layer_count
        return getattr(self.hf_config, "layers_block_type", all_attention)
    assert (self.hf_config.model_type == "mamba")
    return ["mamba"] * layer_count
def get_num_attention_layers(self,
                             parallel_config: "ParallelConfig") -> int:
    """Count the layers whose block type is "attention"."""
    block_types = self.get_layers_block_type(parallel_config)
    return sum(1 for kind in block_types if kind == "attention")
def _get_num_seqlen_agnostic_layers(
        self, parallel_config: "ParallelConfig") -> int:
    """Count the layers whose block type is anything but "attention"."""
    block_types = self.get_layers_block_type(parallel_config)
    return sum(1 for kind in block_types if kind != "attention")
def get_multimodal_config(self) -> "MultiModalConfig":
    """
    Return the multimodal configuration of the model.

    Raises:
        ValueError: If the model is not multimodal.
    """
    config = self.multimodal_config
    if config is not None:
        return config
    raise ValueError("The model is not multimodal.")
@property
def is_encoder_decoder_model(self) -> bool:
    """Return the HF `is_encoder_decoder` flag (False when absent)."""
    flag = getattr(self.hf_config, "is_encoder_decoder", False)
    return flag
@property
def is_embedding_model(self) -> bool:
    """Return the `embedding_mode` flag set by `_verify_embedding_mode`."""
    flag = self.embedding_mode
    return flag
- class CacheConfig:
- """Configuration for the KV cache.
- Args:
- block_size: Size of a cache block in number of tokens.
- gpu_memory_utilization: Fraction of GPU memory to use for the
- Aphrodite execution.
- swap_space: Size of the CPU swap space per GPU (in GiB).
- cache_dtype: Data type for kv cache storage.
- num_gpu_blocks_override: Number of GPU blocks to use. This overrides the
- profiled num_gpu_blocks if specified. Does nothing if None.
- """
def __init__(
    self,
    block_size: int,
    gpu_memory_utilization: float,
    swap_space: float,
    cache_dtype: str,
    is_attention_free: bool = False,
    num_gpu_blocks_override: Optional[int] = None,
    sliding_window: Optional[int] = None,
    enable_prefix_caching: bool = False,
    cpu_offload_gb: float = 0.0,
) -> None:
    """Store the KV-cache settings and validate them.

    `swap_space` is given in GiB and stored in bytes as
    `swap_space_bytes`. The block counts `num_gpu_blocks` and
    `num_cpu_blocks` start as None and are filled in after profiling.
    """
    # Sizing.
    self.block_size = block_size
    self.gpu_memory_utilization = gpu_memory_utilization
    self.swap_space_bytes = swap_space * GiB_bytes
    self.num_gpu_blocks_override = num_gpu_blocks_override
    self.cpu_offload_gb = cpu_offload_gb

    # Cache behaviour.
    self.cache_dtype = cache_dtype
    self.is_attention_free = is_attention_free
    self.sliding_window = sliding_window
    self.enable_prefix_caching = enable_prefix_caching

    # Validation runs before the profiling placeholders are created.
    self._verify_args()
    self._verify_cache_dtype()
    self._verify_prefix_caching()

    # Will be set after profiling.
    self.num_gpu_blocks = None
    self.num_cpu_blocks = None
- def metrics_info(self):
- # convert cache_config to dict(key: str, value: str) for prometheus
- # metrics info
- return {key: str(value) for key, value in self.__dict__.items()}
- def _verify_args(self) -> None:
- if self.gpu_memory_utilization > 1.0:
- raise ValueError(
- "GPU memory utilization must be less than 1.0. Got "
- f"{self.gpu_memory_utilization}.")
- def _verify_cache_dtype(self) -> None:
- if self.cache_dtype == "auto":
- pass
- elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
- logger.info(
- "Using fp8 data type to store kv cache. It reduces the GPU "
- "memory footprint and boosts the performance. "
- "Meanwhile, it may cause accuracy drop without a proper "
- "scaling factor")
- else:
- raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
- def _verify_prefix_caching(self) -> None:
- if not self.enable_prefix_caching:
- return
- if self.sliding_window is not None:
- raise NotImplementedError(
- "Prefix caching is not supported with sliding window. "
- "Run with --disable-sliding-window to use prefix caching.")
-
- if self.cache_dtype == "fp8":
- capability = current_platform.get_device_capability()
- capability = capability[0] * 10 + capability[1]
- if capability < 89:
- raise NotImplementedError(
- "FP8 KV cache with prefix caching is only supported on "
- "GPUs with compute capability 8.9 or higher (e.g., "
- "4090, H100). Your GPU has compute capability "
- f"{capability}")
- if not HAS_TRITON and self.enable_prefix_caching:
- raise ValueError("Triton is not installed, "
- "prefix caching will not work.")
- def verify_with_parallel_config(
- self,
- parallel_config: "ParallelConfig",
- ) -> None:
- total_cpu_memory = get_cpu_memory()
- # FIXME: Here, it is assumed that the GPUs in a tensor parallel
- # group are in the same node. However, the GPUs may span multiple nodes.
- num_gpus_per_node = parallel_config.tensor_parallel_size
- cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
- msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
- f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
- "is allocated for the swap space.")
- if cpu_memory_usage > 0.7 * total_cpu_memory:
- raise ValueError("Too large swap space. " + msg)
- elif cpu_memory_usage > 0.4 * total_cpu_memory:
- logger.warning("Possibly too large swap space. " + msg)
@dataclass
class TokenizerPoolConfig:
    """Settings for the pool of tokenizer workers.

    Args:
        pool_size: Number of tokenizer workers in the pool.
        pool_type: Type of the pool; either the string "ray" or a class.
        extra_config: Additional config for the pool.
            The way the config will be used depends on the
            pool type.
    """
    pool_size: int
    pool_type: Union[str, Type["BaseTokenizerGroup"]]
    extra_config: dict

    def __post_init__(self):
        """Validate the pool type and the extra config.

        Raises:
            ValueError: If ``pool_type`` is neither "ray" nor a class, or if
                ``extra_config`` is not a dict.
        """
        is_known_name = self.pool_type in ("ray", )
        is_pool_class = isinstance(self.pool_type, type)
        if not (is_known_name or is_pool_class):
            raise ValueError(f"Unknown pool type: {self.pool_type}")
        if not isinstance(self.extra_config, dict):
            raise ValueError("extra_config must be a dictionary.")

    @classmethod
    def create_config(
        cls, tokenizer_pool_size: int, tokenizer_pool_type: str,
        tokenizer_pool_extra_config: Optional[Union[str, dict]]
    ) -> Optional["TokenizerPoolConfig"]:
        """Build a TokenizerPoolConfig, or return None for an empty pool.

        Args:
            tokenizer_pool_size: Number of tokenizer workers in the pool.
                A falsy value (e.g. 0) means no pool and yields None.
            tokenizer_pool_type: Type of the pool.
            tokenizer_pool_extra_config: Additional config for the pool.
                A string is parsed as JSON; None or another falsy value
                becomes an empty dict.

        Returns:
            The new config, or None when ``tokenizer_pool_size`` is falsy.
        """
        if not tokenizer_pool_size:
            return None

        if isinstance(tokenizer_pool_extra_config, str):
            extra_config = json.loads(tokenizer_pool_extra_config)
        else:
            extra_config = tokenizer_pool_extra_config or {}
        return cls(tokenizer_pool_size, tokenizer_pool_type, extra_config)
class LoadFormat(str, enum.Enum):
    """Names of the supported model weight formats.

    Members are also ``str`` instances, so they compare equal to their
    lower-case string values.
    """
    AUTO = "auto"
    PT = "pt"
    SAFETENSORS = "safetensors"
    NPCACHE = "npcache"
    DUMMY = "dummy"
    TENSORIZER = "tensorizer"
    SHARDED_STATE = "sharded_state"
    GGUF = "gguf"
    BITSANDBYTES = "bitsandbytes"
    MISTRAL = "mistral"


@dataclass
class LoadConfig:
    """Configuration for loading the model weights.

    download_dir: Directory to download and load the weights, default to the
        default cache directory of huggingface.
    load_format: The format of the model weights to load:
        "auto" will try to load the weights in the safetensors format and
            fall back to the pytorch bin format if safetensors format is
            not available.
        "pt" will load the weights in the pytorch bin format.
        "safetensors" will load the weights in the safetensors format.
        "npcache" will load the weights in pytorch format and store
            a numpy cache to speed up the loading.
        "dummy" will initialize the weights with random values, which is
            mainly for profiling.
        "tensorizer" will use CoreWeave's tensorizer library for
            fast weight loading.
    model_loader_extra_config: Extra options for the model loader, either
        a dict or a JSON string. After construction this is always a dict
        (empty when nothing was supplied).
    ignore_patterns: The list of patterns to ignore when loading the model.
        Default to "original/**/*" to avoid repeated loading of llama's
        checkpoints.
    """

    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
    download_dir: Optional[str] = None
    model_loader_extra_config: Optional[Union[str, dict]] = field(
        default_factory=dict)
    ignore_patterns: Optional[Union[List[str], str]] = None

    def __post_init__(self):
        """Normalise the extra config, load format and ignore patterns."""
        extra_config = self.model_loader_extra_config or {}
        if isinstance(extra_config, str):
            extra_config = json.loads(extra_config)
        # Store the normalised value back so that None (or an empty string)
        # does not survive on the instance in place of the dict default.
        self.model_loader_extra_config = extra_config

        self._verify_load_format()

        if self.ignore_patterns:
            logger.info(
                "Ignoring the following patterns when downloading weights: "
                f"{self.ignore_patterns}")
        else:
            self.ignore_patterns = ["original/**/*"]

    def _verify_load_format(self) -> None:
        """Convert a string ``load_format`` into a ``LoadFormat`` member.

        Non-string values (e.g. a custom loader) are left untouched.

        Raises:
            ValueError: If the string is not a valid ``LoadFormat`` value, or
                if the format is listed as unsupported on ROCm while running
                on ROCm.
        """
        if not isinstance(self.load_format, str):
            return

        load_format = self.load_format.lower()
        self.load_format = LoadFormat(load_format)

        # Lower-case format values that cannot be used on ROCm (none today).
        rocm_not_supported_load_format: List[str] = []
        # Membership is tested first so the platform probe is skipped while
        # the list above is empty.
        if load_format in rocm_not_supported_load_format and is_hip():
            # Compare member *values*: the unsupported list holds lower-case
            # values, not the upper-case member names.
            rocm_supported_load_format = [
                fmt.value for fmt in LoadFormat
                if fmt.value not in rocm_not_supported_load_format
            ]
            raise ValueError(
                f"load format '{load_format}' is not supported in ROCm. "
                f"Supported load formats are "
                f"{rocm_supported_load_format}")
class ParallelConfig:
    """Configuration for the distributed execution.

    Args:
        pipeline_parallel_size: Number of pipeline parallel groups.
        tensor_parallel_size: Number of tensor parallel groups.
        worker_use_ray: Deprecated, use distributed_executor_backend instead.
        max_parallel_loading_workers: Maximum number of multiple batches
            when load model sequentially. To avoid RAM OOM when using tensor
            parallel and large models.
        disable_custom_all_reduce: Disable the custom all-reduce kernel and
            fall back to NCCL.
        tokenizer_pool_config: Config for the tokenizer pool.
            If None, will use synchronous tokenization.
        ray_workers_use_nsight: Whether to profile Ray workers with nsight, see
            https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
        placement_group: ray distributed model workers placement group.
        distributed_executor_backend: Backend to use for distributed model
            workers, either "ray" or "mp" (multiprocessing). If either
            pipeline_parallel_size or tensor_parallel_size is greater than 1,
            will default to "ray" if Ray is installed or "mp" otherwise.
    """

    def __init__(
        self,
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        worker_use_ray: Optional[bool] = None,
        max_parallel_loading_workers: Optional[int] = None,
        disable_custom_all_reduce: bool = False,
        tokenizer_pool_config: Optional[TokenizerPoolConfig] = None,
        ray_workers_use_nsight: bool = False,
        placement_group: Optional["PlacementGroup"] = None,
        distributed_executor_backend: Optional[Union[
            str, Type["ExecutorBase"]]] = None,
    ) -> None:
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.distributed_executor_backend = distributed_executor_backend
        self.max_parallel_loading_workers = max_parallel_loading_workers
        self.disable_custom_all_reduce = disable_custom_all_reduce
        self.tokenizer_pool_config = tokenizer_pool_config
        self.ray_workers_use_nsight = ray_workers_use_nsight
        self.placement_group = placement_group

        self.world_size = pipeline_parallel_size * self.tensor_parallel_size

        # The deprecated flag either selects Ray or must agree with the
        # explicitly requested backend.
        if worker_use_ray:
            if self.distributed_executor_backend is None:
                self.distributed_executor_backend = "ray"
            elif not self.use_ray:
                raise ValueError(f"worker-use-ray can't be used with "
                                 f"distributed executor backend "
                                 f"'{self.distributed_executor_backend}'.")

        needs_default = (self.distributed_executor_backend is None
                         and self.world_size > 1)
        if needs_default:
            backend = self._pick_default_backend()
            self.distributed_executor_backend = backend
            logger.info(
                f"Defaulting to use {backend} for distributed inference.")

        self._verify_args()
        self.rank = 0

    def _pick_default_backend(self) -> str:
        """Choose "mp" or "ray" when no backend was requested.

        We use multiprocessing by default if world_size fits on the current
        node and we aren't in a ray placement group.

        Raises:
            ValueError: If the world size exceeds the local device count and
                Ray is not available.
        """
        from aphrodite.executor import ray_utils

        ray_found = ray_utils.ray_is_available()

        if cuda_device_count_stateless() < self.world_size:
            # Not enough local devices: Ray is mandatory.
            if not ray_found:
                raise ValueError("Unable to load Ray which is "
                                 "required for multi-node inference, "
                                 "please install Ray with `pip install "
                                 "ray`.") from ray_utils.ray_import_err
            return "ray"

        if not ray_found:
            return "mp"

        if self.placement_group:
            return "ray"

        from ray import is_initialized as ray_is_initialized
        if ray_is_initialized():
            from ray.util import get_current_placement_group
            if get_current_placement_group():
                return "ray"
        return "mp"

    @property
    def use_ray(self) -> bool:
        """Whether the selected backend is Ray-based.

        True for the string "ray"; for a backend class, the value of its
        ``uses_ray`` attribute.
        """
        backend = self.distributed_executor_backend
        if backend == "ray":
            return True
        return isinstance(backend, type) and backend.uses_ray

    def _verify_args(self) -> None:
        """Validate the backend choice and apply platform restrictions.

        Raises:
            ValueError: If the backend is not "ray", "mp", None or an
                ExecutorBase subclass, or if nsight profiling is requested
                without Ray.
        """
        # Lazy import to avoid circular import
        from aphrodite.executor.executor_base import ExecutorBase

        backend = self.distributed_executor_backend
        if backend not in ("ray", "mp", None):
            is_executor_class = (isinstance(backend, type)
                                 and issubclass(backend, ExecutorBase))
            if not is_executor_class:
                raise ValueError(
                    "Unrecognized distributed executor backend "
                    f"{self.distributed_executor_backend}. Supported "
                    "values are 'ray', 'mp' or custom ExecutorBase subclass.")

        if self.use_ray:
            from aphrodite.executor import ray_utils
            ray_utils.assert_ray_available()

        if is_hip():
            self.disable_custom_all_reduce = True
            logger.info(
                "Disabled the custom all-reduce kernel because it is not "
                "supported on AMD GPUs.")

        if self.ray_workers_use_nsight and not self.use_ray:
            raise ValueError("Unable to use nsight profiling unless workers "
                             "run with Ray.")
class SchedulerConfig:
    """Scheduler configuration.

    Args:
        max_num_batched_tokens: Maximum number of tokens to be processed in
            a single iteration. If None, a default is derived from the other
            arguments.
        max_num_seqs: Maximum number of sequences to be processed in a single
            iteration.
        max_model_len: Maximum length of a sequence (including prompt
            and generated text).
        is_attention_free: True if the running model does not have state that
            grows as the context size increases.
        use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not.
        num_lookahead_slots: The number of slots to allocate per sequence per
            step, beyond the known token ids. This is used in speculative
            decoding to store KV activations of tokens which may or may not be
            accepted.
        delay_factor: Apply a delay (of delay factor multiplied by previous
            prompt latency) before scheduling next prompt.
        enable_chunked_prefill: If True, prefill requests can be chunked based
            on the remaining max_num_batched_tokens.
        embedding_mode: Whether the running model is for embedding.
        preemption_mode: Whether to perform preemption by swapping or
            recomputation. If not specified, we determine the mode as follows:
            We use recomputation by default since it incurs lower overhead than
            swapping. However, when the sequence group has multiple sequences
            (e.g., beam search), recomputation is not currently supported. In
            such a case, we use swapping instead.
        num_scheduler_steps: Number of scheduler steps; must be at least 1.
            Values above 1 make ``is_multi_step`` True.
        send_delta_data: Private API. If used, scheduler sends delta data to
            workers instead of an entire data. It should be enabled only
            when SPMD worker architecture is enabled. I.e.,
            APHRODITE_USE_RAY_SPMD_WORKER=1
    """

    def __init__(self,
                 max_num_batched_tokens: Optional[int],
                 max_num_seqs: int,
                 max_model_len: int,
                 is_attention_free: bool = False,
                 use_v2_block_manager: bool = False,
                 num_lookahead_slots: int = 0,
                 delay_factor: float = 0.0,
                 enable_chunked_prefill: bool = False,
                 embedding_mode: Optional[bool] = False,
                 preemption_mode: Optional[str] = None,
                 num_scheduler_steps: int = 1,
                 send_delta_data: bool = False) -> None:
        if max_num_batched_tokens is not None:
            token_budget = max_num_batched_tokens
        elif enable_chunked_prefill:
            if not HAS_TRITON:
                raise ValueError("Triton is not installed, "
                                 "chunked prefill will not work.")
            # For chunked prefill, choose the well-tuned batch size.
            token_budget = 768
        elif embedding_mode:
            # For embedding, choose specific value for higher throughput
            token_budget = max(max_model_len,
                               _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS)
        else:
            # If max_model_len is too short, use 2048 as the default value
            # for higher throughput.
            token_budget = max(max_model_len, 2048)
        self.max_num_batched_tokens = token_budget

        if enable_chunked_prefill:
            logger.info(
                "Chunked prefill is enabled with "
                f"max_num_batched_tokens={self.max_num_batched_tokens}.")

        self.max_num_seqs = max_num_seqs
        self.max_model_len = max_model_len
        self.is_attention_free = is_attention_free
        self.use_v2_block_manager = use_v2_block_manager
        self.num_lookahead_slots = num_lookahead_slots
        self.delay_factor = delay_factor
        self.chunked_prefill_enabled = enable_chunked_prefill
        self.embedding_mode = embedding_mode
        self.preemption_mode = preemption_mode
        self.num_scheduler_steps = num_scheduler_steps
        self.send_delta_data = send_delta_data
        self._verify_args()

    def _verify_args(self) -> None:
        """Check that the stored limits are mutually consistent.

        Raises:
            ValueError: If the token budget is below ``max_model_len`` while
                chunked prefill is off, if it is below ``max_num_seqs``, if
                ``num_lookahead_slots`` is negative, or if
                ``num_scheduler_steps`` is below 1.
        """
        token_budget = self.max_num_batched_tokens

        budget_below_model_len = token_budget < self.max_model_len
        if budget_below_model_len and not self.chunked_prefill_enabled:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({self.max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes Aphrodite reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len.")

        if token_budget < self.max_num_seqs:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                "be greater than or equal to max_num_seqs "
                f"({self.max_num_seqs}).")

        if self.num_lookahead_slots < 0:
            raise ValueError(
                "num_lookahead_slots "
                f"({self.num_lookahead_slots}) must be greater than or "
                "equal to 0.")

        if self.num_scheduler_steps < 1:
            raise ValueError(
                "num_scheduler_steps "
                f"({self.num_scheduler_steps}) must be greater than or "
                "equal to 1.")

    @property
    def is_multi_step(self) -> bool:
        """True when more than one scheduler step is configured."""
        return self.num_scheduler_steps > 1
class DeviceConfig:
    """Resolve the device type and the torch device used for inputs.

    Args:
        device: Either "auto" to detect the device type, or an explicit
            device type string that is used as given.
    """

    def __init__(self, device: str = "auto") -> None:
        if device != "auto":
            # Device type is assigned explicitly
            self.device_type = device
        else:
            self.device_type = self._detect_device_type()

        # Some device types require processing inputs on CPU
        if self.device_type in ("neuron", "openvino"):
            self.device = torch.device("cpu")
        elif self.device_type == "tpu":
            self.device = None
        else:
            # Set device with device type
            self.device = torch.device(self.device_type)

    @staticmethod
    def _detect_device_type() -> str:
        """Probe the platform helpers in order and return the first match."""
        if is_neuron():
            return "neuron"
        if is_openvino():
            return "openvino"
        if current_platform.is_tpu():
            return "tpu"
        if is_cpu():
            return "cpu"
        if is_xpu():
            return "xpu"
        # We don't call torch.cuda.is_available() here to
        # avoid initializing CUDA before workers are forked
        return "cuda"
- class SpeculativeConfig:
- """Configuration for speculative decoding.
- The configuration is currently specialized to draft-model speculative
- decoding with top-1 proposals.
- """
- @staticmethod
- def maybe_create_spec_config(
- target_model_config: ModelConfig,
- target_parallel_config: ParallelConfig,
- target_dtype: str,
- speculative_model: Optional[str],
- speculative_model_quantization: Optional[str],
- speculative_draft_tensor_parallel_size: Optional[int],
- num_speculative_tokens: Optional[int],
- speculative_max_model_len: Optional[int],
- enable_chunked_prefill: bool,
- use_v2_block_manager: bool,
- disable_log_stats: bool,
- speculative_disable_by_batch_size: Optional[int],
- ngram_prompt_lookup_max: Optional[int],
- ngram_prompt_lookup_min: Optional[int],
- draft_token_acceptance_method: str,
- typical_acceptance_sampler_posterior_threshold: Optional[float],
- typical_acceptance_sampler_posterior_alpha: Optional[float],
- disable_logprobs: Optional[bool],
- ) -> Optional["SpeculativeConfig"]:
- """Create a SpeculativeConfig if possible, else return None.
- This function attempts to create a SpeculativeConfig object based on the
- provided parameters. If the necessary conditions are met, it returns an
- instance of SpeculativeConfig. Otherwise, it returns None.
- Args:
- target_model_config (ModelConfig): The configuration of the target
- model.
- target_parallel_config (ParallelConfig): The parallel configuration
- for the target model.
- target_dtype (str): The data type used for the target model.
- speculative_model (Optional[str]): The name of the speculative
- model, if provided.
- num_speculative_tokens (Optional[int]): The number of speculative
- tokens, if provided. Will default to the number in the draft
- model config if present, otherwise is required.
- speculative_model_quantization (Optional[str]): Quantization method
- that was used to quantize the speculative model weights. If
- None, we assume the model weights are not quantized.
- speculative_draft_tensor_parallel_size (Optional[int]): The degree
- of the tensor parallelism for the draft model.
- speculative_max_model_len (Optional[int]): The maximum model len of
- the speculative model. Used when testing the ability to skip
- speculation for some sequences.
- enable_chunked_prefill (bool): Whether Aphrodite is configured to
- use chunked prefill or not. Used for raising an error since its
- not yet compatible with spec decode.
- use_v2_block_manager (bool): Whether Aphrodite is configured to
- use the v2 block manager or not. Used for raising an error
- since the v2 block manager is required with spec decode.
- speculative_disable_by_batch_size (Optional[int]): Disable
- speculative decoding for new incoming requests when the number
- of enqueue requests is larger than this value, if provided.
- ngram_prompt_lookup_max (Optional[int]): Max size of ngram token
- window, if provided.
- ngram_prompt_lookup_min (Optional[int]): Min size of ngram token
- window, if provided.
- draft_token_acceptance_method (str): The method to use for
- accepting draft tokens. This can take two possible
- values 'rejection_sampler' and 'typical_acceptance_sampler'
- for RejectionSampler and TypicalAcceptanceSampler
- respectively.
- typical_acceptance_sampler_posterior_threshold (Optional[float]):
- A threshold value that sets a lower bound on the posterior
- probability of a token in the target model for it to be
- accepted. This threshold is used only when we use the
- TypicalAcceptanceSampler for token acceptance.
- typical_acceptance_sampler_posterior_alpha (Optional[float]):
- A scaling factor for the entropy-based threshold in the
- TypicalAcceptanceSampler.
- disable_logprobs (Optional[bool]): If set to True, token log
- probabilities are not returned during speculative decoding.
- If set to False, token log probabilities are returned
- according to the log probability settings in SamplingParams.
- If not specified, it defaults to True.
- Returns:
- Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
- the necessary conditions are met, else None.
- """
- if speculative_model is None:
- if num_speculative_tokens is not None:
- raise ValueError("num_speculative_tokens was provided without "
- "speculative_model.")
- return None
- if (speculative_disable_by_batch_size is not None
- and speculative_disable_by_batch_size < 2):
- raise ValueError("Expected the batch size threshold of disabling "
- "speculative decoding is > 1, but got "
- f"{speculative_disable_by_batch_size=}")
- if enable_chunked_prefill:
- raise ValueError(
- "Speculative decoding and chunked prefill are "
- f"currently mutually exclusive ({enable_chunked_prefill=}).")
- if not use_v2_block_manager:
- raise ValueError(
- "Speculative decoding requires usage of the V2 "
- "block manager. Enable it with --use-v2-block-manager.")
- # TODO: The user should be able to specify revision/max model len
- # for the draft model. It is not currently supported.
- draft_revision = None
- draft_code_revision = None
- draft_quantization = speculative_model_quantization
- if speculative_model == "[ngram]":
- if ngram_prompt_lookup_min is None:
- ngram_prompt_lookup_min = 1
- if ngram_prompt_lookup_max is None or ngram_prompt_lookup_max < 1:
- raise ValueError(f"{ngram_prompt_lookup_max=} must be > 0")
- if ngram_prompt_lookup_min < 1:
- raise ValueError(f"{ngram_prompt_lookup_min=} must be > 0")
- if ngram_prompt_lookup_min > ngram_prompt_lookup_max:
- raise ValueError(f"{ngram_prompt_lookup_min=} cannot be "
- f"larger than {ngram_prompt_lookup_max=}")
- # TODO: current we still need extract vocab_size from target model
- # config, in future, we may try refactoring it out, and set
- # draft related config as None here.
- draft_model_config = target_model_config
- draft_parallel_config = target_parallel_config
- else:
- ngram_prompt_lookup_max = 0
- ngram_prompt_lookup_min = 0
- draft_model_config = ModelConfig(
- model=speculative_model,
- tokenizer=target_model_config.tokenizer,
- tokenizer_mode=target_model_config.tokenizer_mode,
- trust_remote_code=target_model_config.trust_remote_code,
- dtype=target_model_config.dtype,
- seed=target_model_config.seed,
- revision=draft_revision,
- code_revision=draft_code_revision,
- tokenizer_revision=target_model_config.tokenizer_revision,
- max_model_len=None,
- spec_target_max_model_len=target_model_config.max_model_len,
- quantization=draft_quantization,
- enforce_eager=target_model_config.enforce_eager,
- max_seq_len_to_capture=target_model_config.
- max_seq_len_to_capture,
- max_logprobs=target_model_config.max_logprobs,
- )
- draft_hf_config = draft_model_config.hf_config
- if (num_speculative_tokens is not None
- and hasattr(draft_hf_config, "num_lookahead_tokens")):
- draft_hf_config.num_lookahead_tokens = num_speculative_tokens
- n_predict = getattr(draft_hf_config, "n_predict", None)
- if n_predict is not None:
- if num_speculative_tokens is None:
- # Default to max value defined in draft model config.
- num_speculative_tokens = n_predict
- elif num_speculative_tokens > n_predict:
- # Verify provided value doesn't exceed the maximum
- # supported by the draft model.
- raise ValueError(
- "This speculative model supports a maximum of "
- f"num_speculative_tokens={n_predict}, but "
- f"{num_speculative_tokens=} was provided.")
- draft_model_config.max_model_len = (
- SpeculativeConfig._maybe_override_draft_max_model_len(
- speculative_max_model_len,
- draft_model_config.max_model_len,
- target_model_config.max_model_len,
- ))
- draft_parallel_config = (
- SpeculativeConfig.create_draft_parallel_config(
- target_parallel_config,
- speculative_draft_tensor_parallel_size))
- if num_speculative_tokens is None:
- raise ValueError(
- "num_speculative_tokens must be provided with "
- "speculative_model unless the draft model config contains an "
- "n_predict parameter.")
- if typical_acceptance_sampler_posterior_threshold is None:
- typical_acceptance_sampler_posterior_threshold = 0.09
- if typical_acceptance_sampler_posterior_alpha is None:
- typical_acceptance_sampler_posterior_alpha = 0.3
- if disable_logprobs is None:
- disable_logprobs = True
- return SpeculativeConfig(
- draft_model_config,
- draft_parallel_config,
- num_speculative_tokens,
- speculative_disable_by_batch_size,
- ngram_prompt_lookup_max,
- ngram_prompt_lookup_min,
- draft_token_acceptance_method=draft_token_acceptance_method,
- typical_acceptance_sampler_posterior_threshold=\
- typical_acceptance_sampler_posterior_threshold,
- typical_acceptance_sampler_posterior_alpha=\
- typical_acceptance_sampler_posterior_alpha,
- disable_logprobs=disable_logprobs,
- disable_log_stats=disable_log_stats,
- )
- @staticmethod
- def _maybe_override_draft_max_model_len(
- speculative_max_model_len: Optional[int],
- draft_max_model_len: int,
- target_max_model_len: int,
- ) -> int:
- """Determine the max sequence len for the draft model. This is usually
- the draft_max_model_len, but may be the target_max_model_len if it is
- less than the draft_max_model_len, or may be speculative_max_model_len
- if it is specified.
- This is necessary so that sequences do not exceed the capacity of the
- draft model or the target model.
- speculative_max_model_len is mainly used for testing that sequences can
- skip speculation.
- """
- if speculative_max_model_len is not None:
- if speculative_max_model_len > draft_max_model_len:
- raise ValueError(f"{speculative_max_model_len=} cannot be "
- f"larger than {draft_max_model_len=}")
- if speculative_max_model_len > target_max_model_len:
- raise ValueError(f"{speculative_max_model_len=} cannot be "
- f"larger than {target_max_model_len=}")
- return speculative_max_model_len
- return min(
- draft_max_model_len,
- target_max_model_len,
- )
- @staticmethod
- def create_draft_parallel_config(
- target_parallel_config: ParallelConfig,
- speculative_draft_tensor_parallel_size: Optional[int]
- ) -> ParallelConfig:
- """Create a parallel config for use by the draft worker.
- This is mostly a copy of the target parallel config, except the tp_size.
- """
- if speculative_draft_tensor_parallel_size is None:
- speculative_draft_tensor_parallel_size = \
- target_parallel_config.tensor_parallel_size
- elif speculative_draft_tensor_parallel_size != 1:
- # TODO: allow tp values larger than 1
- raise ValueError(
- f"{speculative_draft_tensor_parallel_size=} cannot be "
- f"other value than 1")
- draft_parallel_config = ParallelConfig(
- pipeline_parallel_size=target_parallel_config.
- pipeline_parallel_size,
- tensor_parallel_size=speculative_draft_tensor_parallel_size,
- distributed_executor_backend=target_parallel_config.
- distributed_executor_backend,
- max_parallel_loading_workers=target_parallel_config.
- max_parallel_loading_workers,
- disable_custom_all_reduce=target_parallel_config.
- disable_custom_all_reduce,
- tokenizer_pool_config=target_parallel_config.tokenizer_pool_config,
- ray_workers_use_nsight=target_parallel_config.
- ray_workers_use_nsight,
- placement_group=target_parallel_config.placement_group,
- )
- return draft_parallel_config
- def __init__(
- self,
- draft_model_config: ModelConfig,
- draft_parallel_config: ParallelConfig,
- num_speculative_tokens: int,
- speculative_disable_by_batch_size: Optional[int],
- ngram_prompt_lookup_max: Optional[int],
- ngram_prompt_lookup_min: Optional[int],
- draft_token_acceptance_method: str,
- typical_acceptance_sampler_posterior_threshold: float,
- typical_acceptance_sampler_posterior_alpha: float,
- disable_logprobs: bool,
- disable_log_stats: bool,
- ):
- """Create a SpeculativeConfig object.
- Args:
- draft_model_config: ModelConfig for the draft model.
- draft_parallel_config: ParallelConfig for the draft model.
- num_speculative_tokens: The number of tokens to sample from the
- draft model before scoring with the target model.
- speculative_disable_by_batch_size: Disable speculative
- decoding for new incoming requests when the number of
- enqueue requests is larger than this value.
- ngram_prompt_lookup_max: Max size of ngram token window.
- ngram_prompt_lookup_min: Min size of ngram token window.
- draft_token_acceptance_method (str): The method to use for
- accepting draft tokens. This can take two possible
- values 'rejection_sampler' and 'typical_acceptance_sampler'
- for RejectionSampler and TypicalAcceptanceSampler
- respectively.
- typical_acceptance_sampler_posterior_threshold (Optional[float]):
- A threshold value that sets a lower bound on the posterior
- probability of a token in the target model for it to be
- accepted. This threshold is used only when we use the
- TypicalAcceptanceSampler for token acceptance.
- typical_acceptance_sampler_posterior_alpha (Optional[float]):
- A scaling factor for the entropy-based threshold in the
- TypicalAcceptanceSampler.
- disable_logprobs: If set to True, token log probabilities will not
- be returned even if requested by sampling parameters. This
- reduces latency by skipping logprob calculation in proposal
- sampling, target sampling, and after accepted tokens are
- determined. If set to False, log probabilities will be
- returned.
- disable_log_stats: Whether to disable periodic printing of stage
- times in speculative decoding.
- """
- self.draft_model_config = draft_model_config
- self.draft_parallel_config = draft_parallel_config
- self.num_speculative_tokens = num_speculative_tokens
- self.speculative_disable_by_batch_size = \
- speculative_disable_by_batch_size
- self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0
- self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0
- self.draft_token_acceptance_method = draft_token_acceptance_method
- self.typical_acceptance_sampler_posterior_threshold = \
- typical_acceptance_sampler_posterior_threshold
- self.typical_acceptance_sampler_posterior_alpha = \
- typical_acceptance_sampler_posterior_alpha
- self.disable_logprobs = disable_logprobs
- self.disable_log_stats = disable_log_stats
- self._verify_args()
def _verify_args(self) -> None:
    """Validate the speculative-decoding settings stored on this instance.

    Raises:
        ValueError: If ``num_speculative_tokens`` is not positive, if
            ``draft_token_acceptance_method`` is unset or is not one of
            the two supported samplers, or if either typical-acceptance
            posterior parameter is negative.
    """
    if self.num_speculative_tokens <= 0:
        raise ValueError("Expected num_speculative_tokens to be greater "
                         f"than zero ({self.num_speculative_tokens}).")

    draft_config = self.draft_model_config
    if draft_config:
        draft_config.verify_with_parallel_config(self.draft_parallel_config)

    # Validate the draft token acceptance related settings.
    acceptance_method = self.draft_token_acceptance_method
    if acceptance_method is None:
        raise ValueError("draft_token_acceptance_method is not set. "
                         "Expected values are rejection_sampler or "
                         "typical_acceptance_sampler.")

    supported_methods = ('rejection_sampler', 'typical_acceptance_sampler')
    if acceptance_method not in supported_methods:
        raise ValueError(
            "Expected draft_token_acceptance_method to be either "
            "rejection_sampler or typical_acceptance_sampler. Instead it "
            f"is {self.draft_token_acceptance_method}")

    # NOTE: only strictly negative values are rejected here; zero passes
    # even though the message below is worded as "> 0".
    if (self.typical_acceptance_sampler_posterior_threshold < 0
            or self.typical_acceptance_sampler_posterior_alpha < 0):
        raise ValueError(
            "Expected typical_acceptance_sampler_posterior_threshold "
            "and typical_acceptance_sampler_posterior_alpha to be > 0. "
            "Instead found "
            f"typical_acceptance_sampler_posterior_threshold = "
            f"{self.typical_acceptance_sampler_posterior_threshold} and "
            f"typical_acceptance_sampler_posterior_alpha = "
            f"{self.typical_acceptance_sampler_posterior_alpha}")
@property
def num_lookahead_slots(self) -> int:
    """Extra slots the scheduler should allocate per step.

    These come on top of the slots allocated for each known token. Every
    speculative token must be scored, so the count equals
    ``num_speculative_tokens``.
    """
    lookahead_slots = self.num_speculative_tokens
    return lookahead_slots
def __repr__(self) -> str:
    """Return a short summary naming the draft model and token count.

    When ``ngram_prompt_lookup_max`` is positive the draft model is shown
    as ``"[ngram]"``; otherwise ``draft_model_config.model`` is shown.
    """
    # The local names below are printed verbatim by the ``{name=}``
    # f-string specifiers, so they are part of the output format.
    draft_model = ("[ngram]" if self.ngram_prompt_lookup_max > 0 else
                   self.draft_model_config.model)
    num_spec_tokens = self.num_speculative_tokens
    return f"SpeculativeConfig({draft_model=}, {num_spec_tokens=})"
@dataclass
class LoRAConfig:
    """Settings for LoRA adapters, validated when the instance is built."""
    max_lora_rank: int
    max_loras: int
    fully_sharded_loras: bool = False
    max_cpu_loras: Optional[int] = None
    lora_dtype: Optional[torch.dtype] = None
    lora_extra_vocab_size: int = 256
    # This is a constant.
    lora_vocab_padding_size: ClassVar[int] = 256
    long_lora_scaling_factors: Optional[Tuple[float]] = None

    def __post_init__(self):
        """Check rank, extra vocab size and adapter counts.

        ``max_cpu_loras`` falls back to ``max_loras`` when left as None.

        Raises:
            ValueError: If any value is outside its accepted set or range.
        """
        # A maximum rank of 256 should satisfy the vast majority of
        # applications.
        allowed_ranks = (8, 16, 32, 64, 128, 256)
        allowed_extra_vocab = (0, 256, 512)

        if self.max_lora_rank not in allowed_ranks:
            raise ValueError(
                f"max_lora_rank ({self.max_lora_rank}) must be one of "
                f"{allowed_ranks}.")
        if self.lora_extra_vocab_size not in allowed_extra_vocab:
            raise ValueError(
                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
                f"must be one of {allowed_extra_vocab}.")
        if self.max_loras < 1:
            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")

        if self.max_cpu_loras is None:
            self.max_cpu_loras = self.max_loras
            return
        if self.max_cpu_loras < self.max_loras:
            raise ValueError(
                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                f"max_loras ({self.max_loras})")

    def verify_with_model_config(self, model_config: ModelConfig):
        """Resolve ``lora_dtype`` and warn about untested quantization.

        None or "auto" takes ``model_config.dtype``; any other string is
        looked up as an attribute of ``torch``.
        """
        requested = self.lora_dtype
        if requested in (None, "auto"):
            self.lora_dtype = model_config.dtype
        elif isinstance(requested, str):
            self.lora_dtype = getattr(torch, requested)

        quant = model_config.quantization
        if quant and quant not in ["awq", "gptq"]:
            # TODO support all other quants
            logger.warning(f"{model_config.quantization} quantization is not "
                           "tested with LoRA yet.")

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
        """Warn when chunked prefill is enabled together with LoRA."""
        if not scheduler_config.chunked_prefill_enabled:
            return
        logger.warning("Chunked Prefill with LoRA is not rigorously tested.")

    def verify_with_parallel_config(self, parallel_config: ParallelConfig):
        """Require the vocab padding size to be a multiple of world size.

        Raises:
            ValueError: If ``lora_vocab_padding_size`` is not divisible by
                ``parallel_config.world_size``.
        """
        if self.lora_vocab_padding_size % parallel_config.world_size:
            raise ValueError("LoRA vocab padding size must be divisible "
                             "by world size.")
@dataclass
class PromptAdapterConfig:
    """Settings for prompt adapters, validated when the instance is built."""
    max_prompt_adapters: int
    max_prompt_adapter_token: int
    max_cpu_prompt_adapters: Optional[int] = None
    prompt_adapter_dtype: Optional[torch.dtype] = None

    def __post_init__(self):
        """Check that ``peft`` is importable and the limits are usable.

        ``max_cpu_prompt_adapters`` falls back to ``max_prompt_adapters``
        when left as None.

        Raises:
            ImportError: If the ``peft`` package cannot be imported.
            ValueError: If ``max_prompt_adapters`` is below 1 or
                ``max_prompt_adapter_token`` is 0.
        """
        library_name = 'peft'
        try:
            __import__(library_name)
        except ImportError as e:
            # The trailing space keeps the two sentences from running
            # together once the adjacent literals are concatenated.
            raise ImportError(
                f"'{library_name}' is not installed for prompt adapter "
                f"support. Please install it using "
                f"'pip install {library_name}'.") from e

        if self.max_prompt_adapters < 1:
            raise ValueError(f"max_prompt_adapters "
                             f"({self.max_prompt_adapters}) must be >= 1.")
        if self.max_prompt_adapter_token == 0:
            raise ValueError("max_prompt_adapter_token must be set.")
        if self.max_cpu_prompt_adapters is None:
            self.max_cpu_prompt_adapters = self.max_prompt_adapters

    def verify_with_model_config(self, model_config: ModelConfig):
        """Resolve ``prompt_adapter_dtype`` against the model config.

        None or "auto" takes ``model_config.dtype``; any other string is
        looked up as an attribute of ``torch``.
        """
        if self.prompt_adapter_dtype in (None, "auto"):
            self.prompt_adapter_dtype = model_config.dtype
        elif isinstance(self.prompt_adapter_dtype, str):
            self.prompt_adapter_dtype = getattr(torch,
                                                self.prompt_adapter_dtype)
@dataclass
class MultiModalConfig:
    """Options that govern how multimodal models behave."""

    # Each instance gets its own empty dict by default.
    limit_per_prompt: Mapping[str, int] = field(default_factory=dict)
    """
    The maximum number of multi-modal input instances allowed per prompt
    for each :class:`~aphrodite.multimodal.MultiModalPlugin`.
    """

    # TODO: Add configs to init vision tower or not.
# Accepted dtype strings and the torch dtype each resolves to.
# "half"/"float16" and "float"/"float32" are aliases of one another.
_STR_DTYPE_TO_TORCH_DTYPE = {
    # 16-bit IEEE half precision
    "half": torch.float16,
    "float16": torch.float16,
    # 32-bit single precision
    "float": torch.float32,
    "float32": torch.float32,
    # bfloat16
    "bfloat16": torch.bfloat16,
}

# dtype strings left out of the "supported dtypes" list reported on ROCm.
_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]
def _get_and_verify_dtype(
    config: PretrainedConfig,
    dtype: Union[str, torch.dtype],
) -> torch.dtype:
    """Resolve the requested ``dtype`` against the model config's dtype.

    Args:
        config: Model config. ``torch_dtype`` is read (missing or None is
            treated as float32); ``model_type`` is read only when ``dtype``
            is "auto" and the config dtype is float32.
        dtype: A ``torch.dtype``, a key of ``_STR_DTYPE_TO_TORCH_DTYPE``
            (case-insensitive), or "auto".

    Returns:
        The resolved ``torch.dtype``.

    Raises:
        ValueError: If ``dtype`` is an unknown string or an unsupported
            type, or if float32 is selected while ``is_hip()`` is true.
    """
    # A float32 default passed to getattr would not be enough: the
    # attribute can exist and still hold None.
    config_dtype = getattr(config, "torch_dtype", None)
    if config_dtype is None:
        config_dtype = torch.float32

    if isinstance(dtype, str):
        dtype = dtype.lower()
        if dtype != "auto":
            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
                raise ValueError(f"Unknown dtype: {dtype}")
            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
        elif config_dtype != torch.float32:
            torch_dtype = config_dtype
        elif config.model_type == "gemma2":
            logger.info(
                "For Gemma 2, we downcast float32 to bfloat16 instead "
                "of float16 by default. Please specify `dtype` if you "
                "want to use float16.")
            torch_dtype = torch.bfloat16
        else:
            # Following the common practice, float32 models are run in
            # float16.
            torch_dtype = torch.float16
    elif isinstance(dtype, torch.dtype):
        torch_dtype = dtype
    else:
        raise ValueError(f"Unknown dtype: {dtype}")

    if is_hip() and torch_dtype == torch.float32:
        rocm_supported_dtypes = [
            name for name in _STR_DTYPE_TO_TORCH_DTYPE
            if name not in _ROCM_NOT_SUPPORTED_DTYPE
        ]
        raise ValueError(f"dtype '{dtype}' is not supported in ROCm. "
                         f"Supported dtypes are {rocm_supported_dtypes}")

    # Verify the dtype. Upcasting to float32 and downcasting from float32
    # are both allowed silently; only a float16 <-> bfloat16 cast warns.
    if (torch_dtype != config_dtype and torch_dtype != torch.float32
            and config_dtype != torch.float32):
        logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

    return torch_dtype
def _get_and_verify_max_len(
    hf_config: PretrainedConfig,
    max_model_len: Optional[int],
    disable_sliding_window: bool,
    sliding_window_len: Optional[Union[int, List[Optional[int]]]],
    rope_scaling_arg: Optional[Dict[str, Any]],
    spec_target_max_model_len: Optional[int] = None,
) -> int:
    """Get and verify the model's maximum length.

    Args:
        hf_config: Model config whose length-related attributes and
            ``rope_scaling`` are read. ``rope_scaling`` is overwritten when
            dynamic RoPE scaling is applied (see below).
        max_model_len: User-requested maximum length, or None to use the
            derived value.
        disable_sliding_window: When true, the derived length is also
            capped by the smallest sliding window length.
        sliding_window_len: Sliding window length(s) from the model config.
        rope_scaling_arg: Not read by this function body; kept in the
            signature for callers.
        spec_target_max_model_len: Fallback used when no length key is
            present in ``hf_config`` and ``max_model_len`` is None.

    Returns:
        The verified maximum model length as an int.

    Raises:
        NotImplementedError: If sliding window is disabled for a model with
            a scaled ``rope_scaling``, or when overriding via
            ``model_max_length``.
        ValueError: If ``max_model_len`` exceeds what the config allows and
            dynamic RoPE scaling is not enabled.
        AssertionError: If a scaled ``rope_scaling`` has no "factor" key.
    """
    derived_max_model_len = float("inf")
    possible_keys = [
        # Cohere / Command-R: listed ahead of "max_position_embeddings"
        "model_max_length",
        # OPT
        "max_position_embeddings",
        # GPT-2
        "n_positions",
        # MPT
        "max_seq_len",
        # ChatGLM2
        "seq_length",
        # Others
        "max_sequence_length",
        "max_seq_length",
        "seq_len",
    ]
    # Choose the smallest "max_length" from the possible keys, remembering
    # which key supplied it for the error message further down.
    max_len_key = None
    for key in possible_keys:
        max_len = getattr(hf_config, key, None)
        if max_len is not None and max_len < derived_max_model_len:
            max_len_key = key
            derived_max_model_len = max_len

    # If sliding window is manually disabled, max_length should be less
    # than the sliding window length in the model config.
    if disable_sliding_window and sliding_window_len is not None:
        sliding_window_len_min = get_min_sliding_window(sliding_window_len)
        if sliding_window_len_min < derived_max_model_len:
            max_len_key = "sliding_window"
            derived_max_model_len = sliding_window_len_min

    # If none of the keys were found in the config, use a default and
    # log a warning.
    if derived_max_model_len == float("inf"):
        if max_model_len is not None:
            # If max_model_len is specified, we use it.
            return max_model_len

        if spec_target_max_model_len is not None:
            # If this is a speculative draft model, we use the max model len
            # from the target model.
            return spec_target_max_model_len

        default_max_len = 2048
        logger.warning(
            "The model's config.json does not contain any of the following "
            "keys to determine the original maximum length of the model: "
            f"{possible_keys}. Assuming the model's maximum length is "
            f"{default_max_len}.")
        derived_max_model_len = default_max_len

    rope_scaling = getattr(hf_config, "rope_scaling", None)
    if rope_scaling is not None:
        rope_type = rope_scaling.get("type", rope_scaling.get("rope_type"))
        # These rope types leave the derived length unscaled.
        if rope_type not in {"su", "longrope", "llama3"}:
            if disable_sliding_window:
                # TODO: Find a model that supports rope_scaling
                # with sliding window to see if this case should be allowed.
                raise NotImplementedError(
                    "Disabling sliding window is not supported for models "
                    "with rope_scaling. Please raise an issue so we can "
                    "investigate.")

            assert "factor" in rope_scaling
            scaling_factor = rope_scaling["factor"]
            if rope_type == "yarn":
                # For yarn the factor applies to the original position
                # count rather than the value derived above.
                derived_max_model_len = rope_scaling[
                    "original_max_position_embeddings"]
            derived_max_model_len *= scaling_factor

    # If the user specified a max length, make sure it is smaller than the
    # derived length from the HF model config.
    if max_model_len is None:
        max_model_len = int(derived_max_model_len)
    elif max_model_len > derived_max_model_len:
        # Some models might have a separate key for specifying model_max_length
        # that will be bigger than derived_max_model_len. We compare user input
        # with model_max_length and allow this override when it's smaller.
        model_max_length = getattr(hf_config, "model_max_length", None)
        if envs.APHRODITE_DYNAMIC_ROPE_SCALING:
            scaling_factor = max_model_len / derived_max_model_len
            hf_config.rope_scaling = {"factor": scaling_factor,
                                      "type": "dynamic"}
            logger.info(
                "Using dynamic RoPE scaling to extend the model's max context "
                f"length from {derived_max_model_len} to {max_model_len}.")
            derived_max_model_len = max_model_len
        elif model_max_length is not None and max_model_len <= model_max_length:
            # The override is accepted unless sliding window was disabled.
            if disable_sliding_window:
                # TODO: Find a model that has model_max_length
                # with sliding window to see if this case should be allowed.
                raise NotImplementedError(
                    "Disabling sliding window is not supported for models "
                    "model_max_length in the config. Please raise an issue "
                    "so we can investigate.")
        else:
            raise ValueError(
                f"User-specified max_model_len ({max_model_len}) is greater "
                f"than the derived max_model_len ({max_len_key}="
                f"{derived_max_model_len} or model_max_length="
                f"{model_max_length} in model's config.json). To allow "
                "greater lengths, please set the env var "
                "APHRODITE_DYNAMIC_ROPE_SCALING=1")
    return int(max_model_len)
def get_min_sliding_window(
        sliding_window: Union[int, List[Optional[int]]]) -> int:
    """Return the smallest sliding window length.

    A non-list value is returned unchanged. For a list, ``None`` entries
    are ignored and the minimum of the remaining entries is returned.
    """
    if not isinstance(sliding_window, list):
        return sliding_window
    window_sizes = (size for size in sliding_window if size is not None)
    return min(window_sizes)
def get_served_model_name(model: str,
                          served_model_name: Optional[Union[str, List[str]]]):
    """Pick the name under which the model is served.

    A non-empty list yields its first entry and a non-empty string is used
    as given. An empty string, an empty list or None falls back to
    ``model``.
    """
    if isinstance(served_model_name, list) and served_model_name:
        return served_model_name[0]
    return served_model_name or model
@dataclass
class DecodingConfig:
    """Dataclass which contains the decoding strategy of the engine"""

    # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
    guided_decoding_backend: str = 'lm-format-enforcer'

    def __post_init__(self):
        """Reject any backend name outside the supported set.

        Raises:
            ValueError: If ``guided_decoding_backend`` is not 'outlines'
                or 'lm-format-enforcer'.
        """
        valid_guided_backends = ['outlines', 'lm-format-enforcer']
        backend = self.guided_decoding_backend
        if backend not in valid_guided_backends:
            # Close the quote around the backend name and separate the two
            # clauses so the concatenated message reads correctly.
            raise ValueError(f"Invalid guided_decoding_backend '{backend}', "
                             f"must be one of {valid_guided_backends}")
@dataclass(frozen=True)
class EngineConfig:
    """Immutable bundle of every engine-related configuration object.

    Grouping them makes it simpler to pass the distinct configurations
    around the codebase.
    """
    model_config: ModelConfig
    cache_config: CacheConfig
    parallel_config: ParallelConfig
    scheduler_config: SchedulerConfig
    device_config: DeviceConfig
    load_config: LoadConfig
    lora_config: Optional[LoRAConfig]
    speculative_config: Optional[SpeculativeConfig]
    decoding_config: Optional[DecodingConfig]
    prompt_adapter_config: Optional[PromptAdapterConfig]

    def __post_init__(self):
        """Verify configs are valid & consistent with each other."""
        parallel = self.parallel_config
        self.model_config.verify_with_parallel_config(parallel)
        self.cache_config.verify_with_parallel_config(parallel)

        lora = self.lora_config
        if lora:
            lora.verify_with_model_config(self.model_config)
            lora.verify_with_scheduler_config(self.scheduler_config)
            lora.verify_with_parallel_config(parallel)

        prompt_adapter = self.prompt_adapter_config
        if prompt_adapter:
            prompt_adapter.verify_with_model_config(self.model_config)

    def to_dict(self):
        """Return the configs as a dictionary, for use in **kwargs."""
        return {spec.name: getattr(self, spec.name) for spec in fields(self)}
|