@@ -0,0 +1,379 @@
+# Sample configuration file for Aphrodite Engine
+# You can launch the engine using a provided config file by running
+# `aphrodite yaml config.yaml` in the CLI
+
+# You can run `aphrodite run -h` to see the full list of options
+# that you can pass to the engine.
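+# For example, to launch with this file saved as ./config.yaml (the path is
+# illustrative):
+#   aphrodite yaml ./config.yaml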
+
+# Uncomment and modify the following lines to configure the engine
+
+# The basic options. You will usually need to specify these.
+# (See the commented-out example at the end of this section.)
+basic_args:
+ # Your model name. Can be a local path or a Hugging Face model ID
+ - model:
+
+ # If you want a custom model name for the API, specify it here
+ - served_model_name:
+
+ # Whether or not to launch the Kobold API server. Used for hosting
+ # on Kobold Horde. Takes a boolean value (true/false)
+ - launch_kobold_api:
+
+ # The maximum sequence length/context window for the model
+ # You can leave this blank to use the default value (recommended)
+ - max_model_len:
+
+ # The tensor parallelism degree. Set this to the number of GPUs you have
+ # Keep in mind that for **quantized** models, this will typically only work
+ # with values of 1, 2, 4, or 8.
+ - tensor_parallel_size:
+
+ # The pipeline parallelism degree. This is similar to tensor parallelism,
+ # but splits the layers across GPUs rather than the tensors. Only use this
+ # if you're doing multi-node, or need 3, 5, 6, or 7 GPUs for quantized models.
+ - pipeline_parallel_size:
+
+ # The data type to use for the KV cache. You can set it to 'fp8' to reduce
+ # memory usage for large contexts.
+ - kv_cache_dtype:
+
+ # Enable chunking of the prefill tokens. This greatly reduces memory usage
+ # at high contexts, but is mutually exclusive with kv_cache_dtype=fp8.
+ # Takes a boolean value (true/false)
+ - enable_chunked_prefill:
+
+ # By default, Aphrodite Engine reserves 90% of the VRAM on every GPU it's using.
+ # Pass a value between 0 and 1 (e.g. 0.95 for 95%) to increase or decrease this.
+ - gpu_memory_utilization:
+
+ # If your model doesn't fit on the GPU, use this. It takes values in GiB.
+ # e.g., if you pass `10`, it'll virtually add 10 GiB of VRAM to your GPU.
+ # Not recommended because CPU offloading is generally slow.
+ - cpu_offload_gb:
+
+ # This is essentially the maximum batch size. It's set to `256` by default.
+ # You can lower this to use less memory, but it doesn't affect things that much,
+ # unless `enforce_eager` is enabled.
+ - max_num_seqs:
+
+ # Whether to enforce eager execution, i.e. disable CUDA graphs.
+ # By default, CUDA graphs are disabled. Pass `false` here to enable
+ # them; leave blank or pass `true` to keep them disabled.
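+ # e.g. (illustrative): `- enforce_eager: false` enables CUDA graphs.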
+ - enforce_eager:
+
+ # The load format to use. You can usually leave this blank.
+ # If you want to use bitsandbytes on-the-fly quantization,
+ # pass `bitsandbytes`, along with `quantization=bitsandbytes`
+ # in the quantization_args section below.
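+ # e.g. (illustrative): `- load_format: bitsandbytes` here, together with
+ # `- quantization: bitsandbytes` under quantization_args.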
+ - load_format:
+
+ # Whether or not to enable prefix caching. This will cache
+ # previous prompts so that they're not recomputed. Helps
+ # with large prompts.
+ - enable_prefix_caching:
+
+ # Whether or not to trust remote code in the repository. Needed
+ # for some models that have custom code.
+ - trust_remote_code:
+
+ # The directory to download the model to if `model` is a Hugging Face ID.
+ - download_dir:
+
+ # The data type to use for the model. Can be `auto`, `float16`, `bfloat16`,
+ # or `float32`. Defaults to `auto`, which uses fp16 for fp32 and fp16 models,
+ # and bf16 for bf16 models.
+ - dtype:
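+
+# A commented-out sketch of this section for a simple single-GPU setup.
+# All values below are illustrative placeholders, not recommendations:
+# basic_args:
+#   - model: /path/to/your/model       # or a Hugging Face model ID
+#   - max_model_len: 8192
+#   - tensor_parallel_size: 1
+#   - gpu_memory_utilization: 0.95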
+
+
+
+# Quantization options.
+# (See the commented-out example at the end of this section.)
+quantization_args:
+
+ # The quantization type to use. You don't usually need to pass this,
+ # as the engine will figure out the quantization from the model itself.
+ # You may need to use this if you want to perform online quantization,
+ # i.e., quantizing a 16-bit model on-the-fly.
+ # To use FP8 (only supported by Ampere and newer GPUs), pass `fp8`.
+ # To use bitsandbytes, pass `bitsandbytes`.
+ - quantization:
+
+ # Path to the JSON file containing the KV cache scaling factors.
+ # This should generally be supplied when the KV cache dtype is FP8.
+ # Otherwise, KV cache scaling factors default to 1.0, which
+ # may cause accuracy issues. FP8_E5M2 (without scaling) is
+ # only supported on CUDA versions greater than 11.8. On ROCm,
+ # FP8_E4M3 is used instead.
+ # For most use cases, you can leave this blank. If you want to
+ # generate scales for your model, look at the examples/fp8 directory.
+ - quantization_param_path:
+
+ # The number of floating point bits to use for deepspeed_fp
+ # on-the-fly quantization. Only pass this if you've set
+ # quantization to `deepspeedfp`. Takes 4, 6, 8, or 12.
+ - deepspeed_fp_bits:
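+
+# A commented-out sketch (illustrative): online FP8 quantization of a
+# 16-bit model, as described above:
+# quantization_args:
+#   - quantization: fp8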
+
+
+
+# The API-specific options. These are decoupled from the engine.
+# (See the commented-out example at the end of this section.)
+api_args:
+
+ # The API key to use for the server. Leave blank to disable API key authentication.
+ - api_keys:
+
+ # The local path or HTTP address of the chat template to use.
+ # This will override the model's existing chat template, if
+ # it has one.
+ - chat_template:
+
+ # When max_logprobs is specified, represents single tokens as
+ # strings of the form `token_ids:{token_id}` so that tokens
+ # that are not JSON-encodable can be identified.
+ - return_tokens_as_token_ids:
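+
+# A commented-out sketch with placeholder values (the key and path below
+# are not real):
+# api_args:
+#   - api_keys: sk-example-key
+#   - chat_template: /path/to/template.jinja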
+
+
+# These are the options for speculative decoding. Speculative decoding
+# is a way to speed up inference by loading a smaller draft model
+# that does the predictions, while your main model only verifies
+# its outputs. The outputs will match 1:1 with your main model.
+
+# We currently support the following speculative decoding algorithms:
+# Draft Model, Ngram Prompt Lookup, MLPSpeculator, and Medusa.
+# (See the commented-out example at the end of this section.)
+speculative_args:
+
+ # Use the V2 block manager. Mandatory for speculative decoding.
+ # Takes a boolean value (true/false)
+ - use_v2_block_manager:
+
+ # The speculative model to use. Can take either a Hugging Face ID
+ # or a local path. You can also pass "[ngram]" to use ngram prompt
+ # lookup decoding without needing a draft model.
+ - speculative_model:
+
+ # The number of tokens for the speculative model to predict.
+ # Spec decoding can generate multiple tokens in a single forward
+ # pass to speed up inference. Don't set this too high; a good
+ # value is between 3 and 10, depending on model size.
+ - num_speculative_tokens:
+
+ # The tensor parallel size to use for the speculative model.
+ # Usually, you want this set to 1.
+ - speculative_draft_tensor_parallel_size:
+
+ # The maximum window size for ngram prompt lookup.
+ # This needs to be set if you're using ngram prompt lookup.
+ - ngram_prompt_lookup_max:
+
+ # The minimum window size for ngram prompt lookup.
+ - ngram_prompt_lookup_min:
+
+ # Disable speculative decoding if the number of queued
+ # requests is larger than this value. This is useful
+ # to prevent speculative decoding from using too much
+ # compute.
+ - speculative_disable_by_batch_size:
+
+ # The acceptance method to use for speculative decoding.
+ # Can be either `rejection_sampler` or `typical_acceptance_sampler`.
+ # The default is `rejection_sampler`.
+ # The rejection sampler does not allow changing the acceptance rate
+ # of draft tokens. More accurate but slower.
+ # The typical acceptance sampler allows changing the acceptance rate
+ # of draft tokens. Less accurate but faster.
+ - spec_decoding_acceptance_method:
+
+ # The lower bound threshold for the posterior probability
+ # of a token to be accepted. Only set this if you're using
+ # the typical acceptance sampler. Defaults to 0.09.
+ - typical_acceptance_sampler_posterior_threshold:
+
+ # A scaling factor for the entropy-based threshold for token
+ # acceptance in the typical acceptance sampler. Only set this
+ # if you're using the typical acceptance sampler. Defaults to
+ # the square root of typical_acceptance_sampler_posterior_threshold, i.e. 0.3.
+ - typical_acceptance_sampler_posterior_alpha:
+
+ # Whether to disable logprobs during speculative decoding.
+ # If true, token log probabilities are not returned. If false,
+ # log probabilities are returned according to the settings
+ # in SamplingParams. Defaults to true.
+ # Disabling logprobs (setting this to true) speeds up inference
+ # during speculative decoding by skipping log probability
+ # calculation in proposal and target sampling.
+ - disable_logprobs_during_spec_decoding:
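+
+# A commented-out sketch (illustrative values): ngram prompt lookup, which
+# needs no separate draft model:
+# speculative_args:
+#   - use_v2_block_manager: true
+#   - speculative_model: "[ngram]"
+#   - num_speculative_tokens: 5
+#   - ngram_prompt_lookup_max: 4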
+
+
+# The config options for LoRA adapters.
+# Each adapter is treated as a separate model in the API server,
+# and your requests will need to be sent to that specific model.
+# (See the commented-out example at the end of this section.)
+lora_args:
+
+ # Whether or not to enable handling LoRA adapters.
+ # Takes a boolean value (true/false)
+ - enable_lora:
+
+
+ # The LoRA adapters to use for the API server.
+ # You can specify multiple adapters here.
+ - lora_modules:
+   # Change the name of the adapter to something more descriptive,
+   # e.g. ` - my_sql_lora: /path/to/my_sql_lora`
+   - lora1:
+   - lora2:
+
+ # The maximum number of LoRA adapters in a single batch.
+ - max_loras:
+
+ # The maximum rank of the LoRA adapters. We currently support
+ # up to 64.
+ - max_lora_rank:
+
+ # The maximum size of the extra vocabulary that can be present
+ # in a LoRA adapter (added to the base model vocab).
+ - lora_extra_vocab_size:
+
+ # The data type for the LoRA adapter.
+ # Can take "auto", "float16", "bfloat16", or "float32".
+ - lora_dtype:
+
+ # The maximum number of LoRA adapters to store in CPU memory.
+ # This number must be larger than or equal to max_num_seqs.
+ # Defaults to max_num_seqs.
+ - max_cpu_loras:
+
+ # Specify multiple scaling factors (which can be different from the base
+ # model scaling factor) to allow for multiple LoRA adapters trained
+ # with those scaling factors to be used at the same time.
+ # If not specified, only adapters trained with the base model scaling
+ # factor are allowed.
+ - long_lora_scaling_factors:
+
+ # By default, only half of the LoRA computation is sharded with tensor
+ # parallelism. Enabling this will use the fully sharded layers. At high
+ # sequence length, max rank, or tensor parallel size, this is likely faster.
+ - fully_sharded_loras:
+
+ # The name or path of the QLoRA adapter to use.
+ - qlora_adapter_name_or_path:
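+
+# A commented-out sketch (names and paths are placeholders):
+# lora_args:
+#   - enable_lora: true
+#   - lora_modules:
+#     - my_sql_lora: /path/to/my_sql_lora
+#   - max_lora_rank: 64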
+
+
+# The config options for the Soft Prompt adapters.
+# Soft prompts are a way to tune prompts for a specific task
+# and load them at the request level.
+# (See the commented-out example at the end of this section.)
+soft_prompt_args:
+
+ # Whether or not to enable handling Soft Prompt adapters.
+ # Takes a boolean value (true/false)
+ - enable_prompt_adapter:
+
+ # The Soft Prompt adapters to use for the API server.
+ # You can specify multiple adapters here.
+ - prompt_adapters:
+   # Change the name of the adapter to something more descriptive,
+   # e.g. ` - my_sql_prompt: /path/to/my_sql_prompt`
+   - prompt1:
+   - prompt2:
+
+ # The maximum number of Soft Prompt adapters in a single batch.
+ - max_prompt_adapters:
+
+ # The maximum number of PromptAdapter tokens.
+ - max_prompt_adapter_token:
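+
+# A commented-out sketch (names and paths are placeholders):
+# soft_prompt_args:
+#   - enable_prompt_adapter: true
+#   - prompt_adapters:
+#     - my_sql_prompt: /path/to/my_sql_prompt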
+
+
+# These are advanced options. You usually don't need to modify these.
+# (See the commented-out example at the end of this section.)
+advanced_args:
+
+ # The backend to use for distributed inference. Can be either `ray`
+ # or `mp` (multiprocessing). Defaults to `mp` for single-node,
+ # and `ray` for multi-node.
+ # Note that specifying a custom backend by passing a custom class
+ # is intended for expert use only. The API may change without notice.
+ - distributed_executor_backend:
+
+ # The tokenizer to use. Defaults to the model's tokenizer.
+ - tokenizer:
+
+ # The model revision to use if pulling from HF. Defaults to `main`.
+ - revision:
+
+ # The revision for the remote code in the model repository.
+ - code_revision:
+
+ # The revision for the tokenizer.
+ - tokenizer_revision:
+
+ # The maximum number of tokens to be captured by CUDA graphs.
+ # This is set to 8192 by default. If your prompt exceeds this
+ # threshold, it'll fall back to eager execution.
+ - max_seq_len_to_capture:
+
+ # RoPE scaling config in JSON format.
+ # For example, `{"type": "dynamic", "factor": 2.0}`
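+ # e.g. (illustrative): `- rope_scaling: '{"type": "dynamic", "factor": 2.0}'`
+ # (note the quotes, so the JSON is passed as a string)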
+ - rope_scaling:
+
+ # The RoPE theta value. Use with `rope_scaling`. In some cases,
+ # changing the RoPE theta improves performance of the scaled
+ # model.
+ - rope_theta:
+
+ # Extra config for the model loader.
+ # This will be passed to the model loader corresponding
+ # to the chosen load_format. This should be a JSON string that
+ # will be parsed into a dictionary.
+ - model_loader_extra_config:
+
+ # Whether to skip tokenizer and detokenizer initialization.
+ - skip_tokenizer_init:
+
+ # The size of the tokenizer pool to use for asynchronous tokenization.
+ # If 0, synchronous tokenization is used.
+ - tokenizer_pool_size:
+
+ # The type of tokenizer pool to use for asynchronous tokenization.
+ # Ignored if tokenizer_pool_size is 0.
+ # Note that specifying a tokenizer pool by passing a custom class
+ # is intended for expert use only. The API may change without notice.
+ - tokenizer_pool_type:
+
+ # The extra config for the tokenizer pool. This should be a JSON string
+ # that will be parsed into a dictionary. Ignored if tokenizer_pool_size
+ # is 0.
+ - tokenizer_pool_extra_config:
+
+ # The maximum number of log probabilities to return in the API. Defaults to 10.
+ - max_logprobs:
+
+ # The device to use for model execution. You usually don't
+ # need to modify this.
+ # We support `auto`, `cuda`, `neuron`, `cpu`, `openvino`, `tpu`, and `xpu`.
+ - device:
+
+ # The pattern(s) to ignore when loading the model.
+ # Defaults to `original/**/*` to avoid repeated loading
+ # of Llama's checkpoints.
+ - ignore_patterns:
+
+ # If specified, use nsight to profile Ray workers.
+ - ray_workers_use_nsight:
+
+ # If specified, disable the custom all-reduce kernels.
+ # They're enabled by default for GPUs with P2P support.
+ - disable_custom_all_reduce:
+
+ # The preemption mode to use for the scheduler. If `recompute`,
+ # the engine performs preemption by block recomputation. If `swap`,
+ # the engine performs preemption by block swapping.
+ - preemption_mode:
+
+ # If specified, ignore the GPU profiling result and use this
+ # number of GPU blocks. Only used for testing.
+ - num_gpu_blocks_override:
+
+ # The CPU swap space size (GiB) per GPU. Not related to CPU offloading.
+ - swap_space:
+
+ # Whether to disable the sliding window.
+ - disable_sliding_window:
+
+ # The token block size. Takes a value of 8, 16, or 32.
+ - block_size:
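+
+# A commented-out sketch (illustrative; most deployments can leave this
+# section alone and rely on the defaults):
+# advanced_args:
+#   - distributed_executor_backend: mp
+#   - max_logprobs: 10
+#   - block_size: 16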