
feat: add yaml config parsing (#610)

* feat: add yaml config parsing

* fix: prompt adapters
AlpinDale, 6 months ago
commit 9fcf331f1b
2 files changed with 435 additions and 0 deletions
  1. aphrodite/endpoints/cli.py (+56, -0)
  2. config.yaml (+379, -0)

aphrodite/endpoints/cli.py (+56, -0)

@@ -3,9 +3,11 @@ import argparse
 import asyncio
 import os
 import signal
+import subprocess
 import sys
 from typing import Optional
 
+import yaml
 from openai import OpenAI
 
 from aphrodite.common.utils import FlexibleArgumentParser
@@ -83,6 +85,50 @@ def chat(system_prompt: Optional[str], model_name: str,
         print(output)
 
 
+STR_BOOLS = ['enforce_eager', 'enable_chunked_prefill']
+ADAPTERS = ['lora_modules', 'prompt_adapters']
+
+
+# TODO: refactor this to directly call run_server with the config file
+def serve_yaml(args: argparse.Namespace) -> None:
+
+    def append_cmd_args(cmd, key, value):
+        if value:  # Skip appending if value is empty
+            if key in ADAPTERS and isinstance(value, list):
+                adapters = [f"{k}={v}" for k, v in value[0].items() if v]
+                if adapters:
+                    cmd.append(f"--{key}")
+                    cmd.extend(adapters)
+            else:
+                cmd.append(f"--{key}")
+                if isinstance(value, bool):
+                    if key in STR_BOOLS:
+                        cmd.append(str(value).lower())
+                    elif value:
+                        cmd.append(str(value))
+                else:
+                    cmd.append(str(value))
+
+    with open(args.config_file, 'r') as f:
+        config = yaml.safe_load(f)
+
+    cmd = ["python", "-m", "aphrodite.endpoints.openai.api_server"]
+    for key, value in config.items():
+        if isinstance(value, list):
+            for item in value:
+                for sub_key, sub_value in item.items():
+                    append_cmd_args(cmd, sub_key, sub_value)
+        else:
+            append_cmd_args(cmd, key, value)
+
+    process = subprocess.Popen(cmd)
+    try:
+        process.wait()
+    except KeyboardInterrupt:
+        process.terminate()
+        process.wait()
+
+
 def _add_query_options(
         parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
     parser.add_argument(
@@ -143,6 +189,16 @@ def main():
               "used for models that support system prompts."))
     chat_parser.set_defaults(dispatch_function=interactive_cli, command="chat")
 
+    yaml_parser = subparsers.add_parser(
+        "yaml",
+        help="Start the Aphrodite OpenAI Compatible API server with a YAML "
+        "config file",
+        usage="aphrodite yaml <config.yaml>")
+    yaml_parser.add_argument("config_file",
+                             type=str,
+                             help="The YAML configuration file to use")
+    yaml_parser.set_defaults(dispatch_function=serve_yaml)
+
     args = parser.parse_args()
     # One of the sub commands should be executed.
     if hasattr(args, "dispatch_function"):
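
For illustration, here is a minimal, self-contained sketch of how the flattening logic in `serve_yaml` expands a config into the `api_server` command line. The YAML document, model ID, and adapter path are hypothetical placeholders, and the sketch only prints the command it would build rather than launching anything:

# Sketch of serve_yaml's flattening rules, mirrored outside the CLI for illustration.
import yaml

STR_BOOLS = ['enforce_eager', 'enable_chunked_prefill']
ADAPTERS = ['lora_modules', 'prompt_adapters']

SAMPLE_CONFIG = """
basic_args:
  - model: my-org/my-model          # hypothetical Hugging Face ID
  - tensor_parallel_size: 2
  - enforce_eager: true             # STR_BOOLS entry: emitted as the string "true"
lora_args:
  - lora_modules:
    - my_lora: /path/to/my_lora     # expands to --lora_modules my_lora=/path/to/my_lora
"""


def append_cmd_args(cmd, key, value):
    if not value:  # empty/blank options are skipped entirely
        return
    if key in ADAPTERS and isinstance(value, list):
        # Adapter lists become a single flag followed by name=path pairs
        adapters = [f"{k}={v}" for k, v in value[0].items() if v]
        if adapters:
            cmd.append(f"--{key}")
            cmd.extend(adapters)
        return
    cmd.append(f"--{key}")
    if isinstance(value, bool):
        if key in STR_BOOLS:
            cmd.append(str(value).lower())
        elif value:
            cmd.append(str(value))
    else:
        cmd.append(str(value))


config = yaml.safe_load(SAMPLE_CONFIG)
cmd = ["python", "-m", "aphrodite.endpoints.openai.api_server"]
for key, value in config.items():
    if isinstance(value, list):
        for item in value:
            for sub_key, sub_value in item.items():
                append_cmd_args(cmd, sub_key, sub_value)
    else:
        append_cmd_args(cmd, key, value)

print(" ".join(cmd))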

config.yaml (+379, -0)

@@ -0,0 +1,379 @@
+# Sample configuration file for Aphrodite Engine
+# You can launch the engine using a provided config file by running
+# `aphrodite yaml config.yaml` in the CLI
+
+# You can run `aphrodite run -h` to see the full list of options
+# that you can pass to the engine.
+
+# Uncomment and modify the following lines to configure the engine
+
+# The basic options. You will usually need to specify these
+basic_args:
+  # Your model name. Can be a local path or huggingface model ID
+  - model:
+
+  # If you want a custom model name for the API, specify it here
+  - served_model_name:
+
+  # Whether or not to launch the Kobold API server. Used for hosting
+  # on Kobold Horde. Takes a boolean value (true/false)
+  - launch_kobold_api:
+
+  # The maximum sequence length/context window for the model
+  # You can leave this blank to use the default value (recommended)
+  - max_model_len:
+
+  # The tensor parallelism degree. Set this to the number of GPUs you have
+  # Keep in mind that for **quantized** models, this will typically only work
+  # with values of 1, 2, 4, or 8.
+  - tensor_parallel_size:
+
+  # The pipeline parallelism degree. This is similar to tensor parallel,
+  # but splits the layers across GPUs rather than the tensors. Only use this
+  # if you're doing multi-node, or need 3, 5, 6, 7 GPUs for quantized models.
+  - pipeline_parallel_size:
+
+  # The data type to use for KV cache. You can set it to 'fp8' to reduce
+  # memory usage for large contexts.
+  - kv_cache_dtype:
+
+  # Enable chunking the prefill tokens. This greatly reduces memory usage
+  # at high contexts, but is mutually exclusive with kv_cache_dtype=fp8.
+  # Takes a boolean value (true/false)
+  - enable_chunked_prefill:
+
+  # By default, Aphrodite Engine reserves 90% of VRAM for every GPU it's using.
+  # Pass a value between 0-1 (e.g. 0.95 for 95%) to increase or decrease this.
+  - gpu_memory_utilization:
+
+  # If your model doesn't fit on the GPU, use this. It takes values in GiB.
+  # e.g., if you pass `10`, it'll virtually add 10 GiB of VRAM to your GPU.
+  # Not recommended because CPU offloading is generally slow.
+  - cpu_offload_gb:
+
+  # This is essentially the maximum batch size. It's set to `256` by default.
+  # You can lower this to use less memory, but it doesn't make much difference
+  # unless `enforce_eager` is enabled.
+  - max_num_seqs:
+
+  # Whether to enable CUDA graphs. By default, CUDA graphs are disabled. Pass
+  # `false` here to enable them, and leave blank or pass `true` to keep them disabled.
+  - enforce_eager:
+
+  # The load format to use. You can usually leave this blank.
+  # If you want to use bitsandbytes on-the-fly quantization,
+  # pass `bitsandbytes`, along with `quantization=bitsandbytes`
+  # in the category below.
+  - load_format:
+
+  # Whether or not to enable prefix caching. This will cache
+  # previous prompts so that they're not recomputed. Helps
+  # with large prompts.
+  - enable_prefix_caching:
+
+  # Whether or not to trust remote code in the repository. Needed
+  # for some models that have custom code.
+  - trust_remote_code:
+
+  # The download directory if the `model` is a Hugging Face ID.
+  - download_dir:
+
+  # The data type to use for the model. Can be `auto`, `float16`, `bfloat16`,
+  # `float32`. Defaults to `auto`, which will use fp16 for fp32 and fp16 models,
+  # and bf16 for bf16 models.
+  - dtype:
+
+
+
+# Quantization options.
+quantization_args:
+
+  # The quantization type to use. You don't usually need to pass this,
+  # as the engine will figure out the quant from the model itself.
+  # You may need to use this if you want to perform online quantization,
+  # i.e., quantizing a 16-bit model on-the-fly.
+  # To use FP8 (only supported by Ampere and newer GPUs), pass `fp8`.
+  # To use bitsandbytes, pass `bitsandbytes`.
+  - quantization:
+
+  # Path to the JSON file containing the KV cache scaling factors.
+  # This should generally be supplied when KV cache dtype is FP8.
+  # Otherwise, KV cache scaling factors default to 1.0, which
+  # may cause accuracy issues. FP8_E5M2 (without scaling) is
+  # only supported on CUDA versions greater than 11.8. On ROCm,
+  # FP8_E4M3 is used instead.
+  # For most use cases, you can leave this blank. If you want to
+  # generate scales for your model, look at examples/fp8 directory.
+  - quantization_param_path:
+
+  # The number of floating point bits to use for deepspeed_fp
+  # on-the-fly quantization. Only pass this if you've set
+  # quantization to `deepspeedfp`. Takes 4, 6, 8, or 12.
+  - deepspeed_fp_bits:
+
+
+
+# The API-specific options. These are decoupled from the engine.
+api_args:
+
+  # The API key to use for the server. Leave blank to disable API key authentication.
+  - api_keys:
+
+  # The local path or http address to the chat template to use.
+  # This will override the model's existing chat template, if
+  # it has one.
+  - chat_template:
+
+  # When max_logprobs is specified, represents single tokens as
+  # strings of the form `token_ids:{token_id}` so that tokens
+  # that are not JSON-encodable can be identified.
+  - return_tokens_as_token_ids:
+
+
+# These are the options for speculative decoding. Spec Decoding
+# is a way to speed up inference by loading a smaller model
+# that makes the predictions, while your main model
+# only verifies its outputs. The outputs will match
+# 1:1 with your main model.
+
+# We currently support the following speculative decoding algorithms:
+# Draft Model, Ngram Prompt Lookup, MLPSpeculator, and Medusa.
+speculative_args:
+  
+  # Use the V2 block manager. Mandatory for speculative decoding.
+  # Takes a boolean value (true/false)
+  - use_v2_block_manager:
+
+  # The speculative model to use. Can take either a Hugging Face ID
+  # or a local path. You can also pass "[ngram]" to use ngram prompt
+  # lookup decoding without needing a draft model.
+  - speculative_model:
+
+  # The number of tokens for the speculative model to predict.
+  # Spec decoding can generate multiple tokens in single forward
+  # pass to speed up inference. Don't set this too high, a good
+  # value is between 3-10, depending on model size.
+  - num_speculative_tokens:
+
+  # The tensor parallel size to use for the speculative model.
+  # Usually, you want this set to 1.
+  - speculative_draft_tensor_parallel_size:
+
+  # The maximum window size for ngram prompt lookup
+  # This needs to be set if you're using ngram prompt lookup
+  - ngram_prompt_lookup_max:
+
+  # The minimum window size for ngram prompt lookup
+  - ngram_prompt_lookup_min:
+
+  # Disable speculative decoding if the number of queued
+  # requests is larger than this value. This is useful
+  # to prevent speculative decoding from using too much
+  # compute.
+  - speculative_disable_by_batch_size:
+
+  # The acceptance method to use for speculative decoding.
+  # Can be either `rejection_sampler` or `typical_acceptance_sampler`.
+  # The default is `rejection_sampler`.
+  # Rejection sampler does not allow changing the acceptance rate
+  # of draft tokens. More accurate but slower.
+  # Typical acceptance sampler allows changing the acceptance rate
+  # of draft tokens. Less accurate but faster.
+  - spec_decoding_acceptance_method:
+
+  # The lower bound threshold for the posterior probability
+  # of a token to be accepted. Only set this if you're using
+  # the typical acceptance sampler. Defaults to 0.09.
+  - typical_acceptance_sampler_posterior_threshold:
+
+  # A scaling factor for the entropy-based threshold for token
+  # acceptance in the typical acceptance sampler. Only set this
+  # if you're using the typical acceptance sampler. Defaults to
+  # sqrt of typical_acceptance_sampler_posterior_threshold, i.e. 0.3.
+  - typical_acceptance_sampler_posterior_alpha:
+
+  # Whether to disable logprobs during speculative decoding.
+  # If True, token log probabilities are not returned. If False,
+  # log probabilities are returned according to the settings
+  # in SamplingParams. Defaults to True.
+  # Disabling this (setting to True) speeds up inference
+  # during speculative decoding by skipping log probability
+  # calculation in proposal and target sampling.
+  - disable_logprobs_during_spec_decoding:
+
+
+# The config options for LoRA adapters.
+# Each adapter is treated as a separate model in the API server,
+# and your requests will need to be sent to the specific model.
+lora_args:
+
+  # Whether or not to enable handling LoRA adapters.
+  # Takes a boolean value (true/false)
+  - enable_lora:
+
+
+  # The LoRA adapters to use for the API server.
+  # You can specify multiple adapters here.
+  - lora_modules:
+    # Change the name of the adapter to something more descriptive
+    # e.g. ` - my_sql_lora: /path/to/my_sql_lora`
+    - lora1: 
+    - lora2:
+
+  # The maximum number of LoRA adapters in a single batch.
+  - max_loras:
+
+  # The maximum rank of the LoRA adapters. We currently support
+  # up to 64.
+  - max_lora_rank:
+
+  # The maximum size of extra vocabulary that can be present
+  # in a LoRA adapter (added to the base model vocab)
+  - lora_extra_vocab_size:
+
+  # The data type for the LoRA adapter.
+  # Can take "auto", "float16", "bfloat16", and "float32"
+  - lora_dtype:
+
+  # The maximum number of LoRA adapters to store in CPU memory.
+  # This number must be greater than or equal to max_num_seqs.
+  # Defaults to max_num_seqs.
+  - max_cpu_loras:
+
+  # Specify multiple scaling factors (which can be different from base
+  # model scaling factor) to allow for multiple LoRA adapters trained
+  # with those scaling factors to be used at the same time.
+  # If not specified, only adapters trained with the base model scaling
+  # factor are allowed.
+  - long_lora_scaling_factors:
+
+  # By default, only half of the LoRA computation is sharded with tensor
+  # parallelism. Enabling this will use the fully sharded layers. At high
+  # sequence length, max rank, or tensor parallel size, this is likely faster.
+  - fully_sharded_loras:
+
+  # The name or path of the QLoRA adapter to use.
+  - qlora_adapter_name_or_path:
+
+
+# The config options for the Soft Prompt adapters.
+  # Soft prompts are a way to tune prompts for a specific task
+  # and load them at the request level.
+soft_prompt_args:
+
+  # Whether or not to enable handling Soft Prompt adapters.
+  # Takes a boolean value (true/false)
+  - enable_prompt_adapter:
+
+  # The Soft Prompt adapters to use for the API server.
+  # You can specify multiple adapters here.
+  - prompt_adapters:
+    # Change the name of the adapter to something more descriptive
+    # e.g. ` - my_sql_prompt: /path/to/my_sql_prompt`
+    - prompt1: 
+    - prompt2:
+
+  # The maximum number of Soft Prompt adapters in a single batch.
+  - max_prompt_adapters:
+
+  # The maximum number of PromptAdapter tokens.
+  - max_prompt_adapter_token:
+
+
+# These are advanced options. You usually don't need to modify these.
+advanced_args:
+
+  # The backend to use for distributed inference. Can be either `ray`
+  # or `mp` (multiprocessing). Defaults to `mp` for single-node,
+  # `ray` for multi-node.
+  # Note that specifying a custom backend by passing a custom class
+  # is intended for expert use only. The API may change without notice.
+  - distributed_executor_backend:
+
+  # The tokenizer to use. Defaults to the model's tokenizer.
+  - tokenizer:
+
+  # The model revision to use if pulling from HF. Defaults to main.
+  - revision:
+
+  # The revision for the remote code in the model repository.
+  - code_revision:
+
+  # The revision for the tokenizer.
+  - tokenizer_revision:
+
+  # The maximum number of tokens to be captured by CUDA graphs.
+  # This is set to 8192 by default. If your prompt exceeds this
+  # threshold, it'll fall back to eager execution.
+  - max_seq_len_to_capture:
+
+  # RoPE scaling config in JSON format.
+  # For example, `{"type": "dynamic", "factor": 2.0}`
+  - rope_scaling:
+
+  # The RoPE theta value. Use with `rope_scaling`. In some cases,
+  # changing the RoPE theta improves performance of the scaled
+  # model.
+  - rope_theta:
+
+  # Extra config for the model loader.
+  # This will be passed to the model loader corresponding
+  # to the chosen load_format. This should be a JSON string that
+  # will be parsed into a dictionary.
+  - model_loader_extra_config:
+
+  # Whether to skip tokenizer and detokenizer initialization.
+  - skip_tokenizer_init:
+
+  # The size of tokenizer pool to use for asynchronous tokenization.
+  # If 0, synchronous tokenization will be used.
+  - tokenizer_pool_size:
+
+  # The type of tokenizer pool to use for asynchronous tokenization.
+  # Ignored if tokenizer_pool_size is 0.
+  # Note that specifying a tokenizer pool by passing a custom class
+  # is intended for expert use only. The API may change without notice.
+  - tokenizer_pool_type:
+
+  # The extra config for tokenizer pool. This should be a JSON string
+  # that will be parsed into a dictionary. Ignored if tokenizer_pool_size
+  # is 0.
+  - tokenizer_pool_extra_config:
+
+  # The maximum number of log probabilities to return in the API. Defaults to 10.
+  - max_logprobs:
+
+  # The device to use for model execution. You usually don't
+  # need to modify this.
+  # We support `auto`, `cuda`, `neuron`, `cpu`, `openvino`, `tpu`, and `xpu`.
+  - device:
+
+  # The pattern(s) to ignore when loading the model.
+  # Defaults to `original/**/*` to avoid repeated loading
+  # of llama's checkpoints.
+  - ignore_patterns:
+
+  # If specified, use nsight to profile ray workers.
+  - ray_workers_use_nsight:
+
+  # If specified, disable the custom all-reduce kernels.
+  # They're enabled by default for GPUs with P2P support.
+  - disable_custom_all_reduce:
+
+  # The preemption mode to use for the scheduler. If `recompute`,
+  # the engine performs preemption by block recomputation. If `swap`,
+  # the engine performs preemption by block swapping.
+  - preemption_mode:
+
+  # If specified, ignore GPU profiling result and use this
+  # number of GPU blocks. Only used for testing.
+  - num_gpu_blocks_override:
+
+  # The CPU swap space size (GiB) per GPU. Not related to CPU offloading.
+  - swap_space:
+
+  # Whether to disable sliding window.
+  - disable_sliding_window:
+
+  # The token block size. Takes values of 8, 16, or 32.
+  - block_size:
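
As a usage sketch, the same list-of-single-key-mapping layout can also be produced programmatically and passed to the new `yaml` subcommand; the file name and every value below are hypothetical placeholders:

# Writes a trimmed config in the same layout as the sample above; all values are
# placeholders. Afterwards the server could be started with:
#   aphrodite yaml my_config.yaml
import yaml

trimmed_config = {
    "basic_args": [
        {"model": "/path/to/local/model"},    # local path or Hugging Face ID
        {"gpu_memory_utilization": 0.95},     # reserve 95% of VRAM
        {"enable_chunked_prefill": True},     # STR_BOOLS entry: emitted as "true"
    ],
    "api_args": [
        {"api_keys": "sk-example-key"},       # hypothetical key; omit to disable auth
    ],
}

with open("my_config.yaml", "w") as f:
    yaml.safe_dump(trimmed_config, f, sort_keys=False)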