
feat: add yaml config parsing (#610)

* feat: add yaml config parsing

* fix: prompt adapters
AlpinDale, 6 months ago
commit 9fcf331f1b
2 files changed with 435 additions and 0 deletions
  1. aphrodite/endpoints/cli.py (+56, -0)
  2. config.yaml (+379, -0)

aphrodite/endpoints/cli.py (+56, -0)

@@ -3,9 +3,11 @@ import argparse
 import asyncio
 import os
 import signal
+import subprocess
 import sys
 from typing import Optional
 
+import yaml
 from openai import OpenAI
 
 from aphrodite.common.utils import FlexibleArgumentParser
@@ -83,6 +85,50 @@ def chat(system_prompt: Optional[str], model_name: str,
         print(output)
 
 
+STR_BOOLS = ['enforce_eager', 'enable_chunked_prefill']
+ADAPTERS = ['lora_modules', 'prompt_adapters']
+
+
+# TODO: refactor this to directly call run_server with the config file
+def serve_yaml(args: argparse.Namespace) -> None:
+
+    def append_cmd_args(cmd, key, value):
+        if value:  # Skip appending if value is empty
+            if key in ADAPTERS and isinstance(value, list):
+                adapters = [f"{k}={v}" for k, v in value[0].items() if v]
+                if adapters:
+                    cmd.append(f"--{key}")
+                    cmd.extend(adapters)
+            else:
+                cmd.append(f"--{key}")
+                if isinstance(value, bool):
+                    if key in STR_BOOLS:
+                        cmd.append(str(value).lower())
+                    elif value:
+                        cmd.append(str(value))
+                else:
+                    cmd.append(str(value))
+
+    with open(args.config_file, 'r') as f:
+        config = yaml.safe_load(f)
+
+    cmd = ["python", "-m", "aphrodite.endpoints.openai.api_server"]
+    for key, value in config.items():
+        if isinstance(value, list):
+            for item in value:
+                for sub_key, sub_value in item.items():
+                    append_cmd_args(cmd, sub_key, sub_value)
+        else:
+            append_cmd_args(cmd, key, value)
+
+    process = subprocess.Popen(cmd)
+    try:
+        process.wait()
+    except KeyboardInterrupt:
+        process.terminate()
+        process.wait()
+
+
 def _add_query_options(
         parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
     parser.add_argument(
@@ -143,6 +189,16 @@ def main():
               "used for models that support system prompts."))
     chat_parser.set_defaults(dispatch_function=interactive_cli, command="chat")
 
+    yaml_parser = subparsers.add_parser(
+        "yaml",
+        help="Start the Aphrodite OpenAI Compatible API server with a YAML "
+        "config file",
+        usage="aphrodite yaml <config.yaml>")
+    yaml_parser.add_argument("config_file",
+                             type=str,
+                             help="The YAML configuration file to use")
+    yaml_parser.set_defaults(dispatch_function=serve_yaml)
+
     args = parser.parse_args()
     # One of the sub commands should be executed.
     if hasattr(args, "dispatch_function"):
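
For illustration, here is a minimal, self-contained sketch of how the flattening logic in `serve_yaml` expands a config into the `api_server` command line. The YAML document, model ID, and adapter path are hypothetical placeholders, and the sketch only prints the command it would build rather than launching anything:

# Sketch of serve_yaml's flattening rules, mirrored outside the CLI for illustration.
import yaml

STR_BOOLS = ['enforce_eager', 'enable_chunked_prefill']
ADAPTERS = ['lora_modules', 'prompt_adapters']

SAMPLE_CONFIG = """
basic_args:
  - model: my-org/my-model          # hypothetical Hugging Face ID
  - tensor_parallel_size: 2
  - enforce_eager: true             # STR_BOOLS entry: emitted as the string "true"
lora_args:
  - lora_modules:
    - my_lora: /path/to/my_lora     # expands to --lora_modules my_lora=/path/to/my_lora
"""


def append_cmd_args(cmd, key, value):
    if not value:  # empty/blank options are skipped entirely
        return
    if key in ADAPTERS and isinstance(value, list):
        # Adapter lists become a single flag followed by name=path pairs
        adapters = [f"{k}={v}" for k, v in value[0].items() if v]
        if adapters:
            cmd.append(f"--{key}")
            cmd.extend(adapters)
        return
    cmd.append(f"--{key}")
    if isinstance(value, bool):
        if key in STR_BOOLS:
            cmd.append(str(value).lower())
        elif value:
            cmd.append(str(value))
    else:
        cmd.append(str(value))


config = yaml.safe_load(SAMPLE_CONFIG)
cmd = ["python", "-m", "aphrodite.endpoints.openai.api_server"]
for key, value in config.items():
    if isinstance(value, list):
        for item in value:
            for sub_key, sub_value in item.items():
                append_cmd_args(cmd, sub_key, sub_value)
    else:
        append_cmd_args(cmd, key, value)

print(" ".join(cmd))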

config.yaml (+379, -0)

@@ -0,0 +1,379 @@
+# Sample configuration file for Aphrodite Engine
+# You can launch the engine using a provided config file by running
+# `aphrodite yaml config.yaml` in the CLI
+
+# You can run `aphrodite run -h` to see the full list of options
+# that you can pass to the engine.
+
+# Uncomment and modify the following lines to configure the engine
+
+# The basic options. You will usually need to specify these
+basic_args:
+  # Your model name. Can be a local path or huggingface model ID
+  - model:
+
+  # If you want a custom model name for the API, specify it here
+  - served_model_name:
+
+  # Whether or not to launch the Kobold API server. Used for hosting
+  # on Kobold Horde. Takes a boolean value (true/false)
+  - launch_kobold_api:
+
+  # The maximum sequence length/context window for the model
+  # You can leave this blank to use the default value (recommended)
+  - max_model_len:
+
+  # The tensor parallelism degree. Set this to the number of GPUs you have
+  # Keep in mind that for **quantized** models, this will typically only work
+  # with values of 1, 2, 4, or 8.
+  - tensor_parallel_size:
+
+  # The pipeline parallelism degree. This is similar to tensor parallel,
+  # but splits the layers across GPUs rather than the tensors. Only use this
+  # if you're doing multi-node, or need 3, 5, 6, 7 GPUs for quantized models.
+  - pipeline_parallel_size:
+
+  # The data type to use for KV cache. You can set it to 'fp8' to reduce
+  # memory usage for large contexts.
+  - kv_cache_dtype:
+
+  # Enable chunking the prefill tokens. This greatly reduces memory usage
+  # at high contexts, but is mutually exclusive with kv_cache_dtype=fp8.
+  # Takes a boolean value (true/false)
+  - enable_chunked_prefill:
+
+  # By default, Aphrodite Engine reserves 90% of VRAM for every GPU it's using.
+  # Pass a value between 0-1 (e.g. 0.95 for 95%) to increase or decrease this.
+  - gpu_memory_utilization:
+
+  # If your model doesn't fit on the GPU, use this. It takes values in GiB.
+  # e.g., if you pass `10`, it'll virtually add 10 GiB of VRAM to your GPU.
+  # Not recommended because CPU offloading is generally slow.
+  - cpu_offload_gb:
+
+  # This is essentially the maximum batch size. It's set to `256` by default.
+  # You can lower this to use less memory, but it doesn't make much difference
+  # unless `enforce_eager` is enabled.
+  - max_num_seqs:
+
+  # Whether to enable CUDA graphs. By default, CUDA graphs are disabled. Pass
+  # `false` here to enable them, and leave blank or pass `true` to keep them disabled.
+  - enforce_eager:
+
+  # The load format to use. You can usually leave this blank.
+  # If you want to use bitsandbytes on-the-fly quantization,
+  # pass `bitsandbytes`, along with `quantization=bitsandbytes`
+  # in the category below.
+  - load_format:
+
+  # Whether or not to enable prefix caching. This will cache
+  # previous prompts so that they're not recomputed. Helps
+  # with large prompts.
+  - enable_prefix_caching:
+
+  # Whether or not to trust remote code in the repository. Needed
+  # for some models that have custom code.
+  - trust_remote_code:
+
+  # The download directory if the `model` is a Hugging Face ID.
+  - download_dir:
+
+  # The data type to use for the model. Can be `auto`, `float16`, `bfloat16`,
+  # `float32`. Defaults to `auto`, which will use fp16 for fp32 and fp16 models,
+  # and bf16 for bf16 models.
+  - dtype:
+
+
+
+# Quantization options.
+quantization_args:
+
+  # The quantization type to use. You don't usually need to pass this,
+  # as the engine will figure out the quant from the model itself.
+  # You may need to use this if you want to perform online quantization,
+  # i.e., quantizing a 16-bit model on-the-fly.
+  # To use FP8 (only supported by Ampere and newer GPUs), pass `fp8`.
+  # To use bitsandbytes, pass `bitsandbytes`.
+  - quantization:
+
+  # Path to the JSON file containing the KV cache scaling factors.
+  # This should generally be supplied when KV cache dtype is FP8.
+  # Otherwise, KV cache scaling factors default to 1.0, which
+  # may cause accuracy issues. FP8_E5M2 (without scaling) is
+  # only supported on CUDA versions greater than 11.8. On ROCm,
+  # FP8_E4M3 is used instead.
+  # For most use cases, you can leave this blank. If you want to
+  # generate scales for your model, look at examples/fp8 directory.
+  - quantization_param_path:
+
+  # The number of floating point bits to use for deepspeed_fp
+  # on-the-fly quantization. Only pass this if you've set
+  # quantization to `deepspeedfp`. Takes 4, 6, 8, or 12.
+  - deepspeed_fp_bits:
+
+
+
+# The API-specific options. These are decoupled from the engine.
+api_args:
+
+  # The API key to use for the server. Leave blank to disable API key authentication.
+  - api_keys:
+
+  # The local path or http address to the chat template to use.
+  # This will override the model's existing chat template, if
+  # it has one.
+  - chat_template:
+
+  # When max_logprobs is specified, represents single tokens as
+  # strings of the form `token_ids:{token_id}` so that tokens
+  # that are not JSON-encodable can be identified.
+  - return_tokens_as_token_ids:
+
+
+# These are the options for speculative decoding. Spec Decoding
+# is a way to speed up inference by loading a smaller model
+# that makes the predictions, while your main model
+# only verifies its outputs. The outputs will match
+# 1:1 with your main model.
+
+# We currently support the following speculative decoding algorithms:
+# Draft Model, Ngram Prompt Lookup, MLPSpeculator, and Medusa.
+speculative_args:
+  
+  # Use the V2 block manager. Mandatory for speculative decoding.
+  # Takes a boolean value (true/false)
+  - use_v2_block_manager:
+
+  # The speculative model to use. Can take either a Hugging Face ID
+  # or a local path. You can also pass "[ngram]" to use ngram prompt
+  # lookup decoding without needing a draft model.
+  - speculative_model:
+
+  # The number of tokens for the speculative model to predict.
+  # Spec decoding can generate multiple tokens in single forward
+  # pass to speed up inference. Don't set this too high, a good
+  # value is between 3-10, depending on model size.
+  - num_speculative_tokens:
+
+  # The tensor parallel size to use for the speculative model.
+  # Usually, you want this set to 1.
+  - speculative_draft_tensor_parallel_size:
+
+  # The maximum window size for ngram prompt lookup
+  # This needs to be set if you're using ngram prompt lookup
+  - ngram_prompt_lookup_max:
+
+  # The minimum window size for ngram prompt lookup
+  - ngram_prompt_lookup_min:
+
+  # Disable speculative decoding if the number of queued
+  # requests is larger than this value. This is useful
+  # to prevent speculative decoding from using too much
+  # compute.
+  - speculative_disable_by_batch_size:
+
+  # The acceptance method to use for speculative decoding.
+  # Can be either `rejection_sampler` or `typical_acceptance_sampler`.
+  # The default is `rejection_sampler`.
+  # Rejection sampler does not allow changing the acceptance rate
+  # of draft tokens. More accurate but slower.
+  # Typical acceptance sampler allows changing the acceptance rate
+  # of draft tokens. Less accurate but faster.
+  - spec_decoding_acceptance_method:
+
+  # The lower bound threshold for the posterior probability
+  # of a token to be accepted. Only set this if you're using
+  # the typical acceptance sampler. Defaults to 0.09.
+  - typical_acceptance_sampler_posterior_threshold:
+
+  # A scaling factor for the entropy-based threshold for token
+  # acceptance in the typical acceptance sampler. Only set this
+  # if you're using the typical acceptance sampler. Defaults to
+  # sqrt of typical_acceptance_sampler_posterior_threshold, i.e. 0.3.
+  - typical_acceptance_sampler_posterior_alpha:
+
+  # Whether to disable logprobs during speculative decoding.
+  # If True, token log probabilities are not returned. If False,
+  # log probabilities are returned according to the settings
+  # in SamplingParams. Defaults to True.
+  # Disabling this (setting to True) speeds up inference
+  # during speculative decoding by skipping log probability
+  # calculation in proposal and target sampling.
+  - disable_logprobs_during_spec_decoding:
+
+
+# The config options for LoRA adapters.
+# Each adapter is treated as a separate model in the API server,
+# and your requests will need to be sent to the specific model.
+lora_args:
+
+  # Whether or not to enable handling LoRA adapters.
+  # Takes a boolean value (true/false)
+  - enable_lora:
+
+
+  # The LoRA adapters to use for the API server.
+  # You can specify multiple adapters here.
+  - lora_modules:
+    # Change the name of the adapter to something more descriptive
+    # e.g. ` - my_sql_lora: /path/to/my_sql_lora`
+    - lora1: 
+    - lora2:
+
+  # The maximum number of LoRA adapters in a single batch.
+  - max_loras:
+
+  # The maximum rank of the LoRA adapters. We currently support
+  # up to 64.
+  - max_lora_rank:
+
+  # The maximum size of extra vocabulary that can be present
+  # in a LoRA adapter (added to the base model vocab)
+  - lora_extra_vocab_size:
+
+  # The data type for the LoRA adapter.
+  # Can take "auto", "float16", "bfloat16", and "float32"
+  - lora_dtype:
+
+  # The maximum number of LoRA adapters to store in CPU memory.
+  # This number must be greater than or equal to max_num_seqs.
+  # Defaults to max_num_seqs.
+  - max_cpu_loras:
+
+  # Specify multiple scaling factors (which can be different from base
+  # model scaling factor) to allow for multiple LoRA adapters trained
+  # with those scaling factors to be used at the same time.
+  # If not specified, only adapters trained with the base model scaling
+  # factor are allowed.
+  - long_lora_scaling_factors:
+
+  # By default, only half of the LoRA computation is sharded with tensor
+  # parallelism. Enabling this will use the fully sharded layers. At high
+  # sequence length, max rank, or tensor parallel size, this is likely faster.
+  - fully_sharded_loras:
+
+  # The name or path of the QLoRA adapter to use.
+  - qlora_adapter_name_or_path:
+
+
+# The config options for the Soft Prompt adapters.
+  # Soft prompts are a way to tune prompts for a specific task
+  # and load them at the request level.
+soft_prompt_args:
+
+  # Whether or not to enable handling Soft Prompt adapters.
+  # Takes a boolean value (true/false)
+  - enable_prompt_adapter:
+
+  # The Soft Prompt adapters to use for the API server.
+  # You can specify multiple adapters here.
+  - prompt_adapters:
+    # Change the name of the adapter to something more descriptive
+    # e.g. ` - my_sql_prompt: /path/to/my_sql_prompt`
+    - prompt1: 
+    - prompt2:
+
+  # The maximum number of Soft Prompt adapters in a single batch.
+  - max_prompt_adapters:
+
+  # The maximum number of PromptAdapter tokens.
+  - max_prompt_adapter_token:
+
+
+# These are advanced options. You usually don't need to modify these.
+advanced_args:
+
+  # The backend to use for distributed inference. Can be either `ray`
+  # or `mp` (multiprocessing). Defaults to `mp` for single-node,
+  # `ray` for multi-node.
+  # Note that specifying a custom backend by passing a custom class
+  # is intended for expert use only. The API may change without notice.
+  - distributed_executor_backend:
+
+  # The tokenizer to use. Defaults to the model's tokenizer.
+  - tokenizer:
+
+  # The model revision to use if pulling from HF. Defaults to main.
+  - revision:
+
+  # The revision for the remote code in the model repository.
+  - code_revision:
+
+  # The revision for the tokenizer.
+  - tokenizer_revision:
+
+  # The maximum number of tokens to be captured by CUDA graphs.
+  # This is set to 8192 by default. If your prompt exceeds this
+  # threshold, it'll fall back to eager execution.
+  - max_seq_len_to_capture:
+
+  # RoPE scaling config in JSON format.
+  # For example, `{"type": "dynamic", "factor": 2.0}`
+  - rope_scaling:
+
+  # The RoPE theta value. Use with `rope_scaling`. In some cases,
+  # changing the RoPE theta improves performance of the scaled
+  # model.
+  - rope_theta:
+
+  # Extra config for the model loader.
+  # This will be passed to the model loader corresponding
+  # to the chosen load_format. This should be a JSON string that
+  # will be parsed into a dictionary.
+  - model_loader_extra_config:
+
+  # Whether to skip tokenizer and detokenizer initialization.
+  - skip_tokenizer_init:
+
+  # The size of tokenizer pool to use for asynchronous tokenization.
+  # If 0, synchronous tokenization will be used.
+  - tokenizer_pool_size:
+
+  # The type of tokenizer pool to use for asynchronous tokenization.
+  # Ignored if tokenizer_pool_size is 0.
+  # Note that specifying a tokenizer pool by passing a custom class
+  # is intended for expert use only. The API may change without notice.
+  - tokenizer_pool_type:
+
+  # The extra config for tokenizer pool. This should be a JSON string
+  # that will be parsed into a dictionary. Ignored if tokenizer_pool_size
+  # is 0.
+  - tokenizer_pool_extra_config:
+
+  # The maximum number of log probabilities to return in the API. Defaults to 10.
+  - max_logprobs:
+
+  # The device to use for model execution. You usually don't
+  # need to modify this.
+  # We support `auto`, `cuda`, `neuron`, `cpu`, `openvino`, `tpu`, and `xpu`.
+  - device:
+
+  # The pattern(s) to ignore when loading the model.
+  # Defaults to `original/**/*` to avoid repeated loading
+  # of llama's checkpoints.
+  - ignore_patterns:
+
+  # If specified, use nsight to profile ray workers.
+  - ray_workers_use_nsight:
+
+  # If specified, disable the custom all-reduce kernels.
+  # They're enabled by default for GPUs with P2P support.
+  - disable_custom_all_reduce:
+
+  # The preemption mode to use for the scheduler. If `recompute`,
+  # the engine performs preemption by block recomputation. If `swap`,
+  # the engine performs preemption by block swapping.
+  - preemption_mode:
+
+  # If specified, ignore GPU profiling result and use this
+  # number of GPU blocks. Only used for testing.
+  - num_gpu_blocks_override:
+
+  # The CPU swap space size (GiB) per GPU. Not related to CPU offloading.
+  - swap_space:
+
+  # Whether to disable sliding window.
+  - disable_sliding_window:
+
+  # The token block size. Takes values of 8, 16, or 32.
+  - block_size:
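
As a usage sketch, the same list-of-single-key-mapping layout can also be produced programmatically and passed to the new `yaml` subcommand; the file name and every value below are hypothetical placeholders:

# Writes a trimmed config in the same layout as the sample above; all values are
# placeholders. Afterwards the server could be started with:
#   aphrodite yaml my_config.yaml
import yaml

trimmed_config = {
    "basic_args": [
        {"model": "/path/to/local/model"},    # local path or Hugging Face ID
        {"gpu_memory_utilization": 0.95},     # reserve 95% of VRAM
        {"enable_chunked_prefill": True},     # STR_BOOLS entry: emitted as "true"
    ],
    "api_args": [
        {"api_keys": "sk-example-key"},       # hypothetical key; omit to disable auth
    ],
}

with open("my_config.yaml", "w") as f:
    yaml.safe_dump(trimmed_config, f, sort_keys=False)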