# Sample configuration file for Aphrodite Engine

# You can launch the engine using a provided config file by running
# `aphrodite yaml config.yaml` in the CLI

# You can run `aphrodite run -h` to see the full list of options
# that you can pass to the engine.

# Uncomment and modify the following lines to configure the engine
# The basic options. You will usually need to specify these
basic_args:
  - model:
  - served_model_name:
  - launch_kobold_api:
  - max_model_len:
  - tensor_parallel_size:
  - pipeline_parallel_size:
  - kv_cache_dtype:
  - enable_chunked_prefill:
  - gpu_memory_utilization:
  - cpu_offload_gb:
  - max_num_seqs:
  - enforce_eager:
  - load_format:
  - enable_prefix_caching:
  - trust_remote_code:
  - download_dir:
  - dtype:
# Quantization options.
quantization_args:
  - quantization:
  - quantization_param_path:
  - deepspeed_fp_bits:
# The API-specific options. These are decoupled from the engine.
api_args:
  - api_keys:
  - chat_template:
  - return_tokens_as_token_ids:
# These are the options for speculative decoding. Spec Decoding
# is a way to speed up inference by loading a smaller model
# and letting it do the predictions; your main model
# will only verify its outputs. The outputs will match
# 1:1 with your main model.

# We currently support the following speculative decoding algorithms:
# Draft Model, Ngram Prompt Lookup, MLPSpeculator, and Medusa.
speculative_args:
  - use_v2_block_manager:
  - speculative_model:
  - num_speculative_tokens:
  - speculative_draft_tensor_parallel_size:
  - ngram_prompt_lookup_max:
  - ngram_prompt_lookup_min:
  - speculative_disable_by_batch_size:
  - spec_decoding_acceptance_method:
  - typical_acceptance_sampler_posterior_threshold:
  - typical_acceptance_sampler_posterior_alpha:
  - disable_logprobs_during_spec_decoding:
# The config options for LoRA adapters.
# Each adapter is treated as a separate model in the API server,
# and your requests will need to be sent to the specific model.
lora_args:
  - enable_lora:
  - lora_modules:
    - lora1:
    - lora2:
  - max_loras:
  - max_lora_rank:
  - lora_extra_vocab_size:
  - lora_dtype:
  - max_cpu_loras:
  - long_lora_scaling_factors:
  - fully_sharded_loras:
  - qlora_adapter_name_or_path:
# The config options for the Soft Prompt adapters.
# Soft prompts are a way to tune prompts for a specific task
# and load them at a request level.
soft_prompt_args:
  - enable_prompt_adapter:
  - prompt_adapters:
    - prompt1:
    - prompt2:
  - max_prompt_adapters:
  - max_prompt_adapter_token:
# These are advanced options. You usually don't need to modify these.
advanced_args:
  - distributed_executor_backend:
  - tokenizer:
  - revision:
  - code_revision:
  - tokenizer_revision:
  - max_seq_len_to_capture:
  - rope_scaling:
  - rope_theta:
  - model_loader_extra_config:
  - skip_tokenizer_init:
  - tokenizer_pool_size:
  - tokenizer_pool_type:
  - tokenizer_pool_extra_config:
  - max_logprobs:
  - device:
  - ignore_patterns:
  - ray_workers_use_nsight:
  - disable_custom_all_reduce:
  - preemption_mode:
  - num_gpu_blocks_override:
  - swap_space:
  - disable_sliding_window:
  - block_size: