# args_tools.py

import argparse
import dataclasses
import json
from dataclasses import dataclass
from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,
                    Union)

from loguru import logger

from aphrodite.common.config import (CacheConfig, DecodingConfig, DeviceConfig,
                                     EngineConfig, LoadConfig, LoRAConfig,
                                     ModelConfig, ParallelConfig,
                                     PromptAdapterConfig, SchedulerConfig,
                                     SpeculativeConfig, TokenizerPoolConfig)
from aphrodite.common.utils import FlexibleArgumentParser, is_cpu
from aphrodite.executor.executor_base import ExecutorBase
from aphrodite.quantization import QUANTIZATION_METHODS
from aphrodite.transformers_utils.utils import check_gguf_file

if TYPE_CHECKING:
    from aphrodite.transformers_utils.tokenizer_group import BaseTokenizerGroup


def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
    if len(val) == 0:
        return None

    out_dict: Dict[str, int] = {}
    for item in val.split(","):
        try:
            key, value = item.split("=")
        except ValueError as exc:
            msg = "Each item should be in the form KEY=VALUE"
            raise ValueError(msg) from exc

        try:
            out_dict[key] = int(value)
        except ValueError as exc:
            msg = f"Failed to parse value of item {key}={value}"
            raise ValueError(msg) from exc

    return out_dict
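
# Illustrative examples (not part of the original source): nullable_kvs parses
# a comma-separated KEY=VALUE list into a dict, e.g.
#   nullable_kvs("image=16,video=2")  ->  {"image": 16, "video": 2}
#   nullable_kvs("")                  ->  None
# A malformed item such as "image" or "image=abc" raises ValueError.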


@dataclass
class EngineArgs:
    """Arguments for Aphrodite engine."""
    # Model Options
    model: str
    seed: int = 0
    served_model_name: Optional[Union[str, List[str]]] = None
    tokenizer: Optional[str] = None
    revision: Optional[str] = None
    code_revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    tokenizer_mode: str = "auto"
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    max_model_len: Optional[int] = None
    max_context_len_to_capture: Optional[int] = None
    max_seq_len_to_capture: int = 8192
    rope_scaling: Optional[dict] = None
    rope_theta: Optional[float] = None
    model_loader_extra_config: Optional[dict] = None
    enforce_eager: Optional[bool] = None
    skip_tokenizer_init: bool = False
    tokenizer_pool_size: int = 0
    # Note: Specifying a tokenizer pool by passing a class
    # is intended for expert use only. The API may change without
    # notice.
    tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]] = "ray"
    tokenizer_pool_extra_config: Optional[dict] = None
    limit_mm_per_prompt: Optional[Mapping[str, int]] = None
    max_logprobs: int = 10  # OpenAI default is 5, setting to 10 because ST
    # Device Options
    device: str = "auto"
    # Load Options
    load_format: str = "auto"
    dtype: str = "auto"
    ignore_patterns: Optional[Union[str, List[str]]] = None
    # Parallel Options
    worker_use_ray: Optional[bool] = False
    tensor_parallel_size: int = 1
    pipeline_parallel_size: int = 1
    ray_workers_use_nsight: bool = False
    disable_custom_all_reduce: bool = False
    # Note: Specifying a custom executor backend by passing a class
    # is intended for expert use only. The API may change without
    # notice.
    distributed_executor_backend: Optional[Union[str,
                                                 Type[ExecutorBase]]] = None
    max_parallel_loading_workers: Optional[int] = None
    # Quantization Options
    quantization: Optional[str] = None
    quantization_param_path: Optional[str] = None
    preemption_mode: Optional[str] = None
    deepspeed_fp_bits: Optional[int] = None
    quant_llm_fp_bits: Optional[int] = None
    quant_llm_exp_bits: Optional[int] = None
    # Cache Options
    kv_cache_dtype: str = "auto"
    block_size: int = 16
    enable_prefix_caching: Optional[bool] = False
    num_gpu_blocks_override: Optional[int] = None
    disable_sliding_window: bool = False
    gpu_memory_utilization: float = 0.90
    swap_space: float = 4  # GiB
    cpu_offload_gb: float = 0  # GiB
    # Scheduler Options
    use_v2_block_manager: bool = False
    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: bool = False
    guided_decoding_backend: str = 'outlines'
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    num_scheduler_steps: int = 1
    # Speculative Decoding Options
    num_lookahead_slots: int = 0
    speculative_model: Optional[str] = None
    speculative_model_quantization: Optional[str] = None
    num_speculative_tokens: Optional[int] = None
    speculative_max_model_len: Optional[int] = None
    ngram_prompt_lookup_max: Optional[int] = None
    ngram_prompt_lookup_min: Optional[int] = None
    speculative_draft_tensor_parallel_size: Optional[int] = None
    speculative_disable_by_batch_size: Optional[int] = None
    spec_decoding_acceptance_method: str = 'rejection_sampler'
    typical_acceptance_sampler_posterior_threshold: Optional[float] = None
    typical_acceptance_sampler_posterior_alpha: Optional[float] = None
    disable_logprobs_during_spec_decoding: Optional[bool] = None
    # Adapter Options
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    lora_dtype: str = "auto"
    max_cpu_loras: Optional[int] = None
    long_lora_scaling_factors: Optional[Tuple[float]] = None
    fully_sharded_loras: bool = False
    qlora_adapter_name_or_path: Optional[str] = None
    enable_prompt_adapter: bool = False
    max_prompt_adapters: int = 1
    max_prompt_adapter_token: int = 0
    # Log Options
    disable_log_stats: bool = False

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model
        if is_cpu():
            self.distributed_executor_backend = None
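
    # Illustrative note (not part of the original source): the dataclass can
    # also be built directly in Python, bypassassing the CLI is not required,
    # e.g. EngineArgs(model="EleutherAI/pythia-70m-deduped",
    #                 tensor_parallel_size=2).
    # __post_init__ then fills in the tokenizer from the model path.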

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Shared CLI arguments for the Aphrodite engine."""

        # Model Options
        parser.add_argument(
            "--model",
            type=str,
            default="EleutherAI/pythia-70m-deduped",
            help="Category: Model Options\n"
            "name or path of the huggingface model to use",
        )
        parser.add_argument("--seed",
                            type=int,
                            default=EngineArgs.seed,
                            help="Category: Model Options\n"
                            "random seed")
        parser.add_argument(
            "--served-model-name",
            nargs="+",
            type=str,
            default=None,
            help="Category: API Options\n"
            "The model name(s) used in the API. If multiple "
            "names are provided, the server will respond to any "
            "of the provided names. The model name in the model "
            "field of a response will be the first name in this "
            "list. If not specified, the model name will be the "
            "same as the `--model` argument. Note that this name(s) "
            "will also be used in the `model_name` tag content of "
            "prometheus metrics; if multiple names are provided, the "
            "metrics tag will take the first one.")
        parser.add_argument(
            "--tokenizer",
            type=str,
            default=EngineArgs.tokenizer,
            help="Category: Model Options\n"
            "name or path of the huggingface tokenizer to use",
        )
        parser.add_argument(
            "--revision",
            type=str,
            default=None,
            help="Category: Model Options\n"
            "the specific model version to use. It can be a branch "
            "name, a tag name, or a commit id. If unspecified, will use "
            "the default version.",
        )
        parser.add_argument(
            "--code-revision",
            type=str,
            default=None,
            help="Category: Model Options\n"
            "the specific revision to use for the model code on "
            "Hugging Face Hub. It can be a branch name, a tag name, or a "
            "commit id. If unspecified, will use the default version.",
        )
        parser.add_argument(
            "--tokenizer-revision",
            type=str,
            default=None,
            help="Category: Model Options\n"
            "the specific tokenizer version to use. It can be a branch "
            "name, a tag name, or a commit id. If unspecified, will use "
            "the default version.",
        )
        parser.add_argument(
            "--tokenizer-mode",
            type=str,
            default=EngineArgs.tokenizer_mode,
            choices=['auto', 'slow', 'mistral'],
            help='The tokenizer mode.\n\n* "auto" will use the '
            'fast tokenizer if available.\n* "slow" will '
            'always use the slow tokenizer.\n* '
            '"mistral" will always use the `mistral_common` tokenizer.')
        parser.add_argument(
            "--trust-remote-code",
            action="store_true",
            help="Category: Model Options\n"
            "trust remote code from huggingface",
        )
        parser.add_argument(
            "--download-dir",
            type=str,
            default=EngineArgs.download_dir,
            help="Category: Model Options\n"
            "directory to download and load the weights, "
            "default to the default cache dir of "
            "huggingface",
        )
        parser.add_argument(
            "--max-model-len",
            type=int,
            default=EngineArgs.max_model_len,
            help="Category: Model Options\n"
            "model context length. If unspecified, "
            "will be automatically derived from the model.",
        )
        parser.add_argument("--max-context-len-to-capture",
                            type=int,
                            default=EngineArgs.max_context_len_to_capture,
                            help="Category: Model Options\n"
                            "Maximum context length covered by CUDA "
                            "graphs. When a sequence has context length "
                            "larger than this, we fall back to eager mode. "
                            "(DEPRECATED. Use --max-seq_len-to-capture instead"
                            ")")
        parser.add_argument("--max-seq_len-to-capture",
                            type=int,
                            default=EngineArgs.max_seq_len_to_capture,
                            help="Category: Model Options\n"
                            "Maximum sequence length covered by CUDA "
                            "graphs. When a sequence has context length "
                            "larger than this, we fall back to eager mode.")
        parser.add_argument('--rope-scaling',
                            default=None,
                            type=json.loads,
                            help='Category: Model Options\n'
                            'RoPE scaling configuration in JSON format. '
                            'For example, {"type":"dynamic","factor":2.0}')
        parser.add_argument('--rope-theta',
                            default=None,
                            type=float,
                            help='Category: Model Options\n'
                            'RoPE theta. Use with `rope_scaling`. In '
                            'some cases, changing the RoPE theta improves the '
                            'performance of the scaled model.')
        parser.add_argument("--model-loader-extra-config",
                            type=str,
                            default=EngineArgs.model_loader_extra_config,
                            help="Category: Model Options\n"
                            "Extra config for model loader. "
                            "This will be passed to the model loader "
                            "corresponding to the chosen load_format. "
                            "This should be a JSON string that will be "
                            "parsed into a dictionary.")
        parser.add_argument(
            "--enforce-eager",
            action=StoreBoolean,
            default=EngineArgs.enforce_eager,
            nargs="?",
            const="True",
            help="Category: Model Options\n"
            "Always use eager-mode PyTorch. If False, "
            "will use eager mode and CUDA graph in hybrid "
            "for maximal performance and flexibility.",
        )
        parser.add_argument("--skip-tokenizer-init",
                            action="store_true",
                            help="Category: Model Options\n"
                            "Skip initialization of tokenizer and detokenizer")
        parser.add_argument("--tokenizer-pool-size",
                            type=int,
                            default=EngineArgs.tokenizer_pool_size,
                            help="Category: Model Options\n"
                            "Size of tokenizer pool to use for "
                            "asynchronous tokenization. If 0, will "
                            "use synchronous tokenization.")
        parser.add_argument("--tokenizer-pool-type",
                            type=str,
                            default=EngineArgs.tokenizer_pool_type,
                            help="Category: Model Options\n"
                            "The type of tokenizer pool to use for "
                            "asynchronous tokenization. Ignored if "
                            "tokenizer_pool_size is 0.")
        parser.add_argument("--tokenizer-pool-extra-config",
                            type=str,
                            default=EngineArgs.tokenizer_pool_extra_config,
                            help="Category: Model Options\n"
                            "Extra config for tokenizer pool. "
                            "This should be a JSON string that will be "
                            "parsed into a dictionary. Ignored if "
                            "tokenizer_pool_size is 0.")
        # Multimodal related configs
        parser.add_argument(
            '--limit-mm-per-prompt',
            type=nullable_kvs,
            default=EngineArgs.limit_mm_per_prompt,
            # The default value is given in
            # MultiModalRegistry.init_mm_limits_per_prompt
            help=('For each multimodal plugin, limit how many '
                  'input instances to allow for each prompt. '
                  'Expects a comma-separated list of items, '
                  'e.g.: `image=16,video=2` allows a maximum of 16 '
                  'images and 2 videos per prompt. Defaults to 1 for '
                  'each modality.'))
        parser.add_argument(
            "--max-logprobs",
            type=int,
            default=EngineArgs.max_logprobs,
            help="Category: Model Options\n"
            "maximum number of log probabilities to "
            "return.",
        )
        # Device Options
        parser.add_argument(
            "--device",
            type=str,
            default=EngineArgs.device,
            choices=[
                "auto", "cuda", "neuron", "cpu", "openvino", "tpu", "xpu"
            ],
            help=("Category: Model Options\n"
                  "Device to use for model execution."),
        )
        # Load Options
        parser.add_argument(
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
            choices=[
                'auto',
                'pt',
                'safetensors',
                'npcache',
                'dummy',
                'tensorizer',
                'sharded_state',
                'bitsandbytes',
            ],
            help='Category: Model Options\n'
            'The format of the model weights to load.\n\n'
            '* "auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors format '
            'is not available.\n'
            '* "pt" will load the weights in the pytorch bin format.\n'
            '* "safetensors" will load the weights in the safetensors format.\n'
            '* "npcache" will load the weights in pytorch format and store '
            'a numpy cache to speed up the loading.\n'
            '* "dummy" will initialize the weights with random values, '
            'which is mainly for profiling.\n'
            '* "tensorizer" will load the weights using tensorizer from '
            'CoreWeave. See the Tensorize Aphrodite Model script in the '
            'Examples section for more information.\n'
            '* "bitsandbytes" will load the weights using bitsandbytes '
            'quantization.\n')
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
            choices=[
                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
            ],
            help='Category: Model Options\n'
            'Data type for model weights and activations.\n\n'
            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
            'BF16 precision for BF16 models.\n'
            '* "half" for FP16. Recommended for AWQ quantization.\n'
            '* "float16" is the same as "half".\n'
            '* "bfloat16" for a balance between precision and range.\n'
            '* "float" is shorthand for FP32 precision.\n'
            '* "float32" for FP32 precision.')
        parser.add_argument(
            '--ignore-patterns',
            action="append",
            type=str,
            default=[],
            help="Category: Model Options\n"
            "The pattern(s) to ignore when loading the model. "
            "Defaults to 'original/**/*' to avoid repeated loading of llama's "
            "checkpoints.")
        # Parallel Options
        parser.add_argument(
            '--worker-use-ray',
            action='store_true',
            help='Category: Parallel Options\n'
            'Deprecated, use --distributed-executor-backend=ray.')
        parser.add_argument(
            "--tensor-parallel-size",
            "-tp",
            type=int,
            default=EngineArgs.tensor_parallel_size,
            help="Category: Parallel Options\n"
            "number of tensor parallel replicas, i.e. the number of GPUs "
            "to use.")
        parser.add_argument(
            "--pipeline-parallel-size",
            "-pp",
            type=int,
            default=EngineArgs.pipeline_parallel_size,
            help="Category: Parallel Options\n"
            "number of pipeline stages. Currently not supported.")
        parser.add_argument(
            "--ray-workers-use-nsight",
            action="store_true",
            help="Category: Parallel Options\n"
            "If specified, use nsight to profile ray workers",
        )
        parser.add_argument(
            "--disable-custom-all-reduce",
            action="store_true",
            default=EngineArgs.disable_custom_all_reduce,
            help="Category: Parallel Options\n"
            "See ParallelConfig",
        )
        parser.add_argument(
            '--distributed-executor-backend',
            choices=['ray', 'mp'],
            default=EngineArgs.distributed_executor_backend,
            help='Category: Parallel Options\n'
            'Backend to use for distributed serving. When more than 1 GPU '
            'is used, will be automatically set to "ray" if installed '
            'or "mp" (multiprocessing) otherwise.')
        parser.add_argument(
            "--max-parallel-loading-workers",
            type=int,
            default=EngineArgs.max_parallel_loading_workers,
            help="Category: Parallel Options\n"
            "load model sequentially in multiple batches, "
            "to avoid RAM OOM when using tensor "
            "parallel and large models",
        )
        # Quantization Options
        parser.add_argument(
            "--quantization",
            "-q",
            type=str,
            choices=[*QUANTIZATION_METHODS, None],
            default=EngineArgs.quantization,
            help="Category: Quantization Options\n"
            "Method used to quantize the weights. If "
            "None, we first check the `quantization_config` "
            "attribute in the model config file. If that is "
            "None, we assume the model weights are not "
            "quantized and use `dtype` to determine the data "
            "type of the weights.",
        )
        parser.add_argument(
            '--quantization-param-path',
            type=str,
            default=None,
            help='Category: Quantization Options\n'
            'Path to the JSON file containing the KV cache '
            'scaling factors. This should generally be supplied when '
            'the KV cache dtype is FP8. Otherwise, KV cache scaling factors '
            'default to 1.0, which may cause accuracy issues. '
            'FP8_E5M2 (without scaling) is only supported on CUDA versions '
            'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
            'supported for common inference criteria.')
        parser.add_argument(
            '--preemption-mode',
            type=str,
            default=None,
            help='Category: Scheduler Options\n'
            'If \'recompute\', the engine performs preemption by '
            'recomputing; if \'swap\', the engine performs preemption by '
            'block swapping.')
        parser.add_argument("--deepspeed-fp-bits",
                            type=int,
                            default=None,
                            help="Category: Quantization Options\n"
                            "Number of floating bits to use for the deepspeed "
                            "quantization. Supported bits are: 4, 6, 8, 12.")
        parser.add_argument("--quant-llm-fp-bits",
                            type=int,
                            default=None,
                            help="Category: Quantization Options\n"
                            "Number of floating bits to use for the quant_llm "
                            "quantization. Supported bits are: 4 to 15.")
        parser.add_argument("--quant-llm-exp-bits",
                            type=int,
                            default=None,
                            help="Category: Quantization Options\n"
                            "Number of exponent bits to use for the quant_llm "
                            "quantization. Supported bits are: 1 to 5.")
        # Cache Options
        parser.add_argument(
            '--kv-cache-dtype',
            type=str,
            choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
            default=EngineArgs.kv_cache_dtype,
            help='Category: Cache Options\n'
            'Data type for kv cache storage. If "auto", will use model '
            'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
            'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
        parser.add_argument(
            "--block-size",
            type=int,
            default=EngineArgs.block_size,
            choices=[8, 16, 32, 128, 256, 512, 1024, 2048],
            help="Category: Cache Options\n"
            "token block size",
        )
        parser.add_argument(
            "--enable-prefix-caching",
            "--context-shift",
            action="store_true",
            help="Category: Cache Options\n"
            "Enable automatic prefix caching.",
        )
        parser.add_argument(
            "--num-gpu-blocks-override",
            type=int,
            default=None,
            help="Category: Cache Options\n"
            "If specified, ignore GPU profiling result and use this "
            "number of GPU blocks. Used for testing preemption.")
        parser.add_argument('--disable-sliding-window',
                            action='store_true',
                            help='Category: KV Cache Options\n'
                            'Disables sliding window, '
                            'capping to sliding window size')
        parser.add_argument(
            "--gpu-memory-utilization",
            "-gmu",
            type=float,
            default=EngineArgs.gpu_memory_utilization,
            help="Category: Cache Options\n"
            "The fraction of GPU memory to be used for "
            "the model executor, which can range from 0 to 1. "
            "If unspecified, will use the default value of 0.9.",
        )
        parser.add_argument(
            "--swap-space",
            type=float,
            default=EngineArgs.swap_space,
            help="Category: Cache Options\n"
            "CPU swap space size (GiB) per GPU",
        )
        parser.add_argument(
            '--cpu-offload-gb',
            type=float,
            default=0,
            help='Category: Cache Options\n'
            'The space in GiB to offload to CPU, per GPU. '
            'Default is 0, which means no offloading. Intuitively, '
            'this argument can be seen as a virtual way to increase '
            'the GPU memory size. For example, if you have one 24 GB '
            'GPU and set this to 10, virtually you can think of it as '
            'a 34 GB GPU. Then you can load a 13B model with BF16 weights, '
            'which requires at least 26 GB of GPU memory. Note that this '
            'requires a fast CPU-GPU interconnect, as part of the model is '
            'loaded from CPU memory to GPU memory on the fly in each '
            'model forward pass.')
        # Scheduler Options
        parser.add_argument("--use-v2-block-manager",
                            action="store_true",
                            help="Category: Scheduler Options\n"
                            "Use the v2 block manager.")
        parser.add_argument(
            "--scheduler-delay-factor",
            "-sdf",
            type=float,
            default=EngineArgs.scheduler_delay_factor,
            help="Category: Scheduler Options\n"
            "Apply a delay (of delay factor multiplied by previous "
            "prompt latency) before scheduling next prompt.")
        parser.add_argument(
            "--enable-chunked-prefill",
            action=StoreBoolean,
            default=EngineArgs.enable_chunked_prefill,
            nargs="?",
            const="True",
            help="Category: Scheduler Options\n"
            "If True, the prefill requests can be chunked based on the "
            "max_num_batched_tokens.")
        parser.add_argument(
            '--guided-decoding-backend',
            type=str,
            default='outlines',
            choices=['outlines', 'lm-format-enforcer'],
            help='Category: Scheduler Options\n'
            'Which engine will be used for guided decoding'
            ' (JSON schema / regex etc) by default. Currently support '
            'https://github.com/outlines-dev/outlines and '
            'https://github.com/noamgat/lm-format-enforcer.'
            ' Can be overridden per request via guided_decoding_backend'
            ' parameter.')
        parser.add_argument(
            "--max-num-batched-tokens",
            type=int,
            default=EngineArgs.max_num_batched_tokens,
            help="Category: KV Cache Options\n"
            "maximum number of batched tokens per "
            "iteration",
        )
        parser.add_argument(
            "--max-num-seqs",
            type=int,
            default=EngineArgs.max_num_seqs,
            help="Category: API Options\n"
            "maximum number of sequences per iteration",
        )
        parser.add_argument('--num-scheduler-steps',
                            type=int,
                            default=1,
                            help=('Maximum number of forward steps per '
                                  'scheduler call.'))
        # Speculative Decoding Options
        parser.add_argument("--num-lookahead-slots",
                            type=int,
                            default=EngineArgs.num_lookahead_slots,
                            help="Category: Speculative Decoding Options\n"
                            "Experimental scheduling config necessary for "
                            "speculative decoding. This will be replaced by "
                            "speculative decoding config in the future; it is "
                            "present for testing purposes until then.")
        parser.add_argument(
            "--speculative-model",
            type=str,
            default=EngineArgs.speculative_model,
            help="Category: Speculative Decoding Options\n"
            "The name of the draft model to be used in speculative decoding.")
        # Quantization settings for speculative model.
        parser.add_argument(
            '--speculative-model-quantization',
            type=str,
            choices=[*QUANTIZATION_METHODS, None],
            default=EngineArgs.speculative_model_quantization,
            help='Method used to quantize the weights of the speculative '
            'model. If None, we first check the `quantization_config` '
            'attribute in the model config file. If that is '
            'None, we assume the model weights are not '
            'quantized and use `dtype` to determine the data '
            'type of the weights.')
        parser.add_argument("--num-speculative-tokens",
                            type=int,
                            default=EngineArgs.num_speculative_tokens,
                            help="Category: Speculative Decoding Options\n"
                            "The number of speculative tokens to sample from "
                            "the draft model in speculative decoding")
        parser.add_argument(
            "--speculative-max-model-len",
            type=int,
            default=EngineArgs.speculative_max_model_len,
            help="Category: Speculative Decoding Options\n"
            "The maximum sequence length supported by the "
            "draft model. Sequences over this length will skip "
            "speculation.")
        parser.add_argument(
            "--ngram-prompt-lookup-max",
            type=int,
            default=EngineArgs.ngram_prompt_lookup_max,
            help="Category: Speculative Decoding Options\n"
            "Max size of window for ngram prompt lookup in speculative "
            "decoding.")
        parser.add_argument(
            "--ngram-prompt-lookup-min",
            type=int,
            default=EngineArgs.ngram_prompt_lookup_min,
            help="Category: Speculative Decoding Options\n"
            "Min size of window for ngram prompt lookup in speculative "
            "decoding.")
        parser.add_argument(
            "--speculative-draft-tensor-parallel-size",
            "-spec-draft-tp",
            type=int,
            default=EngineArgs.speculative_draft_tensor_parallel_size,
            help="Category: Speculative Decoding Options\n"
            "Number of tensor parallel replicas for "
            "the draft model in speculative decoding.")
        parser.add_argument(
            "--speculative-disable-by-batch-size",
            type=int,
            default=EngineArgs.speculative_disable_by_batch_size,
            help="Category: Speculative Decoding Options\n"
            "Disable speculative decoding for new incoming requests "
            "if the number of enqueued requests is larger than this value.")
        parser.add_argument(
            '--spec-decoding-acceptance-method',
            type=str,
            default=EngineArgs.spec_decoding_acceptance_method,
            choices=['rejection_sampler', 'typical_acceptance_sampler'],
            help='Category: Speculative Decoding Options\n'
            'Specify the acceptance method to use during draft token '
            'verification in speculative decoding. Two types of acceptance '
            'routines are supported: '
            '1) RejectionSampler, which does not allow changing the '
            'acceptance rate of draft tokens, '
            '2) TypicalAcceptanceSampler, which is configurable, allowing for '
            'a higher acceptance rate at the cost of lower quality, '
            'and vice versa.')
        parser.add_argument(
            '--typical-acceptance-sampler-posterior-threshold',
            type=float,
            default=EngineArgs.typical_acceptance_sampler_posterior_threshold,
            help='Category: Speculative Decoding Options\n'
            'Set the lower bound threshold for the posterior '
            'probability of a token to be accepted. This threshold is '
            'used by the TypicalAcceptanceSampler to make sampling decisions '
            'during speculative decoding. Defaults to 0.09.')
        parser.add_argument(
            '--typical-acceptance-sampler-posterior-alpha',
            type=float,
            default=EngineArgs.typical_acceptance_sampler_posterior_alpha,
            help='Category: Speculative Decoding Options\n'
            'A scaling factor for the entropy-based threshold for token '
            'acceptance in the TypicalAcceptanceSampler. Typically defaults '
            'to sqrt of --typical-acceptance-sampler-posterior-threshold, '
            'i.e. 0.3.')
        parser.add_argument(
            '--disable-logprobs-during-spec-decoding',
            type=bool,
            default=EngineArgs.disable_logprobs_during_spec_decoding,
            help='Category: Speculative Decoding Options\n'
            'If set to True, token log probabilities are not returned '
            'during speculative decoding. If set to False, log probabilities '
            'are returned according to the settings in SamplingParams. If '
            'not specified, it defaults to True. Disabling log probabilities '
            'during speculative decoding reduces latency by skipping logprob '
            'calculation in proposal sampling, target sampling, and after '
            'accepted tokens are determined.')
        # Adapter Options
        parser.add_argument(
            "--enable-lora",
            action="store_true",
            help="Category: Adapter Options\n"
            "If True, enable handling of LoRA adapters.",
        )
        parser.add_argument(
            "--max-loras",
            type=int,
            default=EngineArgs.max_loras,
            help="Category: Adapter Options\n"
            "Max number of LoRAs in a single batch.",
        )
        parser.add_argument(
            "--max-lora-rank",
            type=int,
            default=EngineArgs.max_lora_rank,
            help="Category: Adapter Options\n"
            "Max LoRA rank.",
        )
        parser.add_argument(
            "--lora-extra-vocab-size",
            type=int,
            default=EngineArgs.lora_extra_vocab_size,
            help=("Category: Adapter Options\n"
                  "Maximum size of extra vocabulary that can be "
                  "present in a LoRA adapter (added to the base "
                  "model vocabulary)."),
        )
        parser.add_argument(
            "--lora-dtype",
            type=str,
            default=EngineArgs.lora_dtype,
            choices=["auto", "float16", "bfloat16", "float32"],
            help=("Category: Adapter Options\n"
                  "Data type for LoRA. If auto, will default to "
                  "base model dtype."),
        )
        parser.add_argument(
            "--max-cpu-loras",
            type=int,
            default=EngineArgs.max_cpu_loras,
            help=("Category: Adapter Options\n"
                  "Maximum number of LoRAs to store in CPU memory. "
                  "Must be >= max_num_seqs. "
                  "Defaults to max_num_seqs."),
        )
        parser.add_argument(
            "--long-lora-scaling-factors",
            type=str,
            default=EngineArgs.long_lora_scaling_factors,
            help=("Category: Adapter Options\n"
                  "Specify multiple scaling factors (which can "
                  "be different from base model scaling factor "
                  "- see eg. Long LoRA) to allow for multiple "
                  "LoRA adapters trained with those scaling "
                  "factors to be used at the same time. If not "
                  "specified, only adapters trained with the "
                  "base model scaling factor are allowed."))
        parser.add_argument(
            "--fully-sharded-loras",
            action='store_true',
            help=("Category: Adapter Options\n"
                  "By default, only half of the LoRA computation is sharded "
                  "with tensor parallelism. Enabling this will use the fully "
                  "sharded layers. At high sequence length, max rank or "
                  "tensor parallel size, this is likely faster."))
        parser.add_argument("--qlora-adapter-name-or-path",
                            type=str,
                            default=None,
                            help="Category: Adapter Options\n"
                            "Name or path of the QLoRA adapter to use.")
        parser.add_argument('--enable-prompt-adapter',
                            action='store_true',
                            help='Category: Adapter Options\n'
                            'If True, enable handling of PromptAdapters.')
        parser.add_argument('--max-prompt-adapters',
                            type=int,
                            default=EngineArgs.max_prompt_adapters,
                            help='Category: Adapter Options\n'
                            'Max number of PromptAdapters in a batch.')
        parser.add_argument('--max-prompt-adapter-token',
                            type=int,
                            default=EngineArgs.max_prompt_adapter_token,
                            help='Category: Adapter Options\n'
                            'Max number of PromptAdapter tokens.')
        # Log Options
        parser.add_argument(
            "--disable-log-stats",
            action="store_true",
            help="Category: Log Options\n"
            "disable logging statistics",
        )

        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args
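
    # Illustrative note (not part of the original source): from_cli_args
    # copies every dataclass field from the parsed namespace by name, so a
    # parser built with add_cli_args above round-trips cleanly, e.g.
    #   ns = parser.parse_args(["--model", "my-model", "-tp", "2"])
    #   engine_args = EngineArgs.from_cli_args(ns)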

    def create_engine_config(self) -> EngineConfig:
        # gguf file needs a specific model loader and doesn't use hf_repo
        if check_gguf_file(self.model):
            self.quantization = self.load_format = "gguf"

        # bitsandbytes quantization needs a specific model loader
        # so we make sure the quant method and the load format are consistent
        if (self.quantization == "bitsandbytes" or
                self.qlora_adapter_name_or_path is not None) and \
                self.load_format != "bitsandbytes":
            raise ValueError(
                "BitsAndBytes quantization and QLoRA adapter only support "
                f"'bitsandbytes' load format, but got {self.load_format}")

        if (self.load_format == "bitsandbytes" or
                self.qlora_adapter_name_or_path is not None) and \
                self.quantization != "bitsandbytes":
            raise ValueError(
                "BitsAndBytes load format and QLoRA adapter only support "
                f"'bitsandbytes' quantization, but got {self.quantization}")

        assert self.cpu_offload_gb >= 0, (
            "CPU offload space must be non-negative"
            f", but got {self.cpu_offload_gb}")

        device_config = DeviceConfig(device=self.device)
        model_config = ModelConfig(
            model=self.model,
            tokenizer=self.tokenizer,
            tokenizer_mode=self.tokenizer_mode,
            trust_remote_code=self.trust_remote_code,
            dtype=self.dtype,
            seed=self.seed,
            revision=self.revision,
            code_revision=self.code_revision,
            rope_scaling=self.rope_scaling,
            rope_theta=self.rope_theta,
            tokenizer_revision=self.tokenizer_revision,
            max_model_len=self.max_model_len,
            quantization=self.quantization,
            deepspeed_fp_bits=self.deepspeed_fp_bits,
            quant_llm_fp_bits=self.quant_llm_fp_bits,
            quant_llm_exp_bits=self.quant_llm_exp_bits,
            quantization_param_path=self.quantization_param_path,
            enforce_eager=self.enforce_eager,
            max_context_len_to_capture=self.max_context_len_to_capture,
            max_seq_len_to_capture=self.max_seq_len_to_capture,
            max_logprobs=self.max_logprobs,
            disable_sliding_window=self.disable_sliding_window,
            skip_tokenizer_init=self.skip_tokenizer_init,
            served_model_name=self.served_model_name,
            limit_mm_per_prompt=self.limit_mm_per_prompt,
        )
        cache_config = CacheConfig(
            block_size=self.block_size,
            gpu_memory_utilization=self.gpu_memory_utilization,
            swap_space=self.swap_space,
            cache_dtype=self.kv_cache_dtype,
            is_attention_free=model_config.is_attention_free(),
            num_gpu_blocks_override=self.num_gpu_blocks_override,
            sliding_window=model_config.get_sliding_window(),
            enable_prefix_caching=self.enable_prefix_caching,
            cpu_offload_gb=self.cpu_offload_gb,
        )
        parallel_config = ParallelConfig(
            pipeline_parallel_size=self.pipeline_parallel_size,
            tensor_parallel_size=self.tensor_parallel_size,
            worker_use_ray=self.worker_use_ray,
            max_parallel_loading_workers=self.max_parallel_loading_workers,
            disable_custom_all_reduce=self.disable_custom_all_reduce,
            tokenizer_pool_config=TokenizerPoolConfig.create_config(
                tokenizer_pool_size=self.tokenizer_pool_size,
                tokenizer_pool_type=self.tokenizer_pool_type,
                tokenizer_pool_extra_config=self.tokenizer_pool_extra_config,
            ),
            ray_workers_use_nsight=self.ray_workers_use_nsight,
            distributed_executor_backend=self.distributed_executor_backend)

        max_model_len = model_config.max_model_len
        use_long_context = max_model_len > 32768
        if self.enable_chunked_prefill is None:
            # If not explicitly set, enable chunked prefill by default for
            # long context (> 32K) models. This is to avoid OOM errors in the
            # initial memory profiling phase.
            if use_long_context:
                is_gpu = device_config.device_type == "cuda"
                use_sliding_window = (model_config.get_sliding_window()
                                      is not None)
                use_spec_decode = self.speculative_model is not None
                has_seqlen_agnostic_layers = (
                    model_config.contains_seqlen_agnostic_layers(
                        parallel_config))
                if (is_gpu and not use_sliding_window and not use_spec_decode
                        and not self.enable_lora
                        and not self.enable_prompt_adapter
                        and not self.enable_prefix_caching
                        and not has_seqlen_agnostic_layers):
                    self.enable_chunked_prefill = True
                    logger.warning(
                        "Chunked prefill is enabled by default for models "
                        "with max_model_len > 32K. Currently, chunked prefill "
                        "might not work with some features or models. If you "
                        "encounter any issues, please disable chunked prefill "
                        "by setting --enable-chunked-prefill=False.")
            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = False

        if not self.enable_chunked_prefill and use_long_context:
            logger.warning(
                f"The model has a long context length ({max_model_len}). "
                "This may cause OOM errors during the initial memory "
                "profiling phase, or result in low performance due to small "
                "KV cache space. Consider setting --max-model-len to a "
                "smaller value.")

        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            target_dtype=self.dtype,
            speculative_model=self.speculative_model,
            speculative_model_quantization=\
                self.speculative_model_quantization,
            speculative_draft_tensor_parallel_size=self.
            speculative_draft_tensor_parallel_size,
            num_speculative_tokens=self.num_speculative_tokens,
            speculative_disable_by_batch_size=self.
            speculative_disable_by_batch_size,
            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
            use_v2_block_manager=self.use_v2_block_manager,
            disable_log_stats=self.disable_log_stats,
            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
            draft_token_acceptance_method=\
                self.spec_decoding_acceptance_method,
            typical_acceptance_sampler_posterior_threshold=self.
            typical_acceptance_sampler_posterior_threshold,
            typical_acceptance_sampler_posterior_alpha=self.
            typical_acceptance_sampler_posterior_alpha,
            disable_logprobs=self.disable_logprobs_during_spec_decoding,
        )

        if self.num_scheduler_steps > 1:
            if speculative_config is not None:
                raise ValueError("Speculative decoding is not supported with "
                                 "multi-step (--num-scheduler-steps > 1).")
            if self.enable_chunked_prefill:
                raise ValueError("Chunked prefill is not supported with "
                                 "multi-step (--num-scheduler-steps > 1).")
            raise NotImplementedError("Multi-step is not yet supported.")

        # make sure num_lookahead_slots is set to the higher value depending
        # on whether we are using speculative decoding or multi-step
        num_lookahead_slots = max(self.num_lookahead_slots,
                                  self.num_scheduler_steps - 1)
        num_lookahead_slots = num_lookahead_slots \
            if speculative_config is None \
            else speculative_config.num_lookahead_slots

        scheduler_config = SchedulerConfig(
            max_num_batched_tokens=self.max_num_batched_tokens,
            max_num_seqs=self.max_num_seqs,
            max_model_len=model_config.max_model_len,
            is_attention_free=model_config.is_attention_free(),
            use_v2_block_manager=self.use_v2_block_manager,
            num_lookahead_slots=num_lookahead_slots,
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
            embedding_mode=model_config.embedding_mode,
            preemption_mode=self.preemption_mode,
            num_scheduler_steps=self.num_scheduler_steps,
        )
        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            fully_sharded_loras=self.fully_sharded_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            long_lora_scaling_factors=self.long_lora_scaling_factors,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
            and self.max_cpu_loras > 0 else None) if self.enable_lora else None

        if self.qlora_adapter_name_or_path is not None and \
                self.qlora_adapter_name_or_path != "":
            if self.model_loader_extra_config is None:
                self.model_loader_extra_config = {}
            self.model_loader_extra_config[
                "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path

        load_config = LoadConfig(
            load_format=self.load_format,
            download_dir=self.download_dir,
            model_loader_extra_config=self.model_loader_extra_config,
            ignore_patterns=self.ignore_patterns)

        prompt_adapter_config = PromptAdapterConfig(
            max_prompt_adapters=self.max_prompt_adapters,
            max_prompt_adapter_token=self.max_prompt_adapter_token) \
            if self.enable_prompt_adapter else None

        decoding_config = DecodingConfig(
            guided_decoding_backend=self.guided_decoding_backend)

        if (model_config.get_sliding_window() is not None
                and scheduler_config.chunked_prefill_enabled
                and not scheduler_config.use_v2_block_manager):
            raise ValueError(
                "Chunked prefill is not supported with sliding window. "
                "Set --disable-sliding-window to disable sliding window.")

        return EngineConfig(model_config=model_config,
                            cache_config=cache_config,
                            parallel_config=parallel_config,
                            scheduler_config=scheduler_config,
                            device_config=device_config,
                            lora_config=lora_config,
                            speculative_config=speculative_config,
                            load_config=load_config,
                            decoding_config=decoding_config,
                            prompt_adapter_config=prompt_adapter_config)


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous Aphrodite engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    uvloop: bool = False

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser,
                     async_args_only: bool = False) -> FlexibleArgumentParser:
        if not async_args_only:
            parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='Use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='Disable logging requests.')
        parser.add_argument(
            "--uvloop",
            action="store_true",
            help="Use the Uvloop asyncio event loop to possibly increase "
            "performance")
        return parser


class StoreBoolean(argparse.Action):

    def __call__(self, parser, namespace, values, option_string=None):
        if values.lower() == "true":
            setattr(namespace, self.dest, True)
        elif values.lower() == "false":
            setattr(namespace, self.dest, False)
        else:
            raise ValueError(f"Invalid boolean value: {values}. "
                             "Expected 'true' or 'false'.")
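

# Minimal usage sketch (illustrative, not part of the original module): build
# the shared CLI, parse a demo command line, and reconstruct EngineArgs from
# the resulting namespace. create_engine_config() is deliberately not called
# here, since it loads the model configuration.
if __name__ == "__main__":
    demo_parser = FlexibleArgumentParser(description="EngineArgs demo")
    demo_parser = EngineArgs.add_cli_args(demo_parser)
    demo_namespace = demo_parser.parse_args(
        ["--model", "EleutherAI/pythia-70m-deduped", "-tp", "1"])
    print(EngineArgs.from_cli_args(demo_namespace))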