
chore: consolidate environment variables within one file (#882)

AlpinDale, 3 months ago
Commit 901900854e
36 changed files with 543 additions and 132 deletions
  1. + 3 - 7      aphrodite/assets/base.py
  2. + 2 - 4      aphrodite/attention/backends/rocm_flash_attn.py
  3. + 2 - 1      aphrodite/attention/selector.py
  4. + 34 - 16    aphrodite/common/config.py
  5. + 3 - 2      aphrodite/common/logger.py
  6. + 3 - 3      aphrodite/common/sampling_params.py
  7. + 8 - 11     aphrodite/common/utils.py
  8. + 2 - 2      aphrodite/distributed/device_communicators/custom_all_reduce.py
  9. + 7 - 6      aphrodite/distributed/device_communicators/custom_all_reduce_utils.py
  10. + 3 - 3     aphrodite/distributed/device_communicators/shm_broadcast.py
  11. + 3 - 2     aphrodite/distributed/parallel_state.py
  12. + 3 - 2     aphrodite/distributed/utils.py
  13. + 2 - 1     aphrodite/endpoints/openai/api_server.py
  14. + 2 - 3     aphrodite/engine/aphrodite_engine.py
  15. + 2 - 3     aphrodite/engine/args_tools.py
  16. + 2 - 3     aphrodite/engine/async_aphrodite.py
  17. + 388 - 0   aphrodite/envs.py
  18. + 2 - 1     aphrodite/executor/cpu_executor.py
  19. + 3 - 1     aphrodite/executor/multiproc_worker_utils.py
  20. + 4 - 5     aphrodite/executor/openvino_executor.py
  21. + 7 - 8     aphrodite/executor/ray_gpu_executor.py
  22. + 2 - 1     aphrodite/executor/ray_tpu_executor.py
  23. + 2 - 1     aphrodite/executor/ray_xpu_executor.py
  24. + 2 - 2     aphrodite/modeling/layers/fused_moe/fused_moe.py
  25. + 2 - 3     aphrodite/modeling/layers/sampler.py
  26. + 3 - 3     aphrodite/modeling/model_loader/openvino.py
  27. + 4 - 3     aphrodite/modeling/model_loader/tensorizer.py
  28. + 3 - 5     aphrodite/multimodal/utils.py
  29. + 3 - 5     aphrodite/plugins/__init__.py
  30. + 2 - 3     aphrodite/quantization/fp8.py
  31. + 3 - 3     aphrodite/server/launch.py
  32. + 2 - 3     aphrodite/task_handler/cpu_worker.py
  33. + 2 - 2     aphrodite/task_handler/tpu_worker.py
  34. + 2 - 2     aphrodite/transformers_utils/config.py
  35. + 4 - 5     examples/tensorize_aphrodite_model.py
  36. + 22 - 7    setup.py

+ 3 - 7
aphrodite/assets/base.py

@@ -5,6 +5,7 @@ from functools import lru_cache
 from pathlib import Path
 from typing import Optional
 
+from aphrodite import envs
 from aphrodite.connections import global_http_connection
 
 
@@ -15,13 +16,8 @@ def get_default_cache_root():
     )
 
 vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
-APHRODITE_ASSETS_CACHE = os.path.expanduser(
-    os.getenv(
-        "APHRODITE_ASSETS_CACHE",
-        os.path.join(get_default_cache_root(), "aphrodite", "assets"),
-    ))
-APHRODITE_IMAGE_FETCH_TIMEOUT = int(os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT",
-                                              5))
+APHRODITE_ASSETS_CACHE = envs.APHRODITE_ASSETS_CACHE
+APHRODITE_IMAGE_FETCH_TIMEOUT = envs.APHRODITE_IMAGE_FETCH_TIMEOUT
 
 def get_cache_dir() -> Path:
     """Get the path to the cache for storing downloaded assets."""

+ 2 - 4
aphrodite/attention/backends/rocm_flash_attn.py

@@ -1,11 +1,11 @@
 """Attention layer ROCm GPUs."""
-import os
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.attention.backends.abstract import (AttentionBackend,
                                                    AttentionImpl,
                                                    AttentionMetadata,
@@ -280,9 +280,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
 
         self.use_naive_attn = False
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
-        self.use_triton_flash_attn = (os.environ.get(
-            "APHRODITE_USE_TRITON_FLASH_ATTN", "True").lower()
-                                      in ("true", "1"))
+        self.use_triton_flash_attn = envs.APHRODITE_USE_TRITON_FLASH_ATTN
         if self.use_triton_flash_attn:
             from aphrodite.attention.ops.triton_flash_attn import (  # noqa: F401
                 triton_attention)

+ 2 - 1
aphrodite/attention/selector.py

@@ -7,12 +7,13 @@ from typing import Generator, Optional, Type
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.attention.backends.abstract import AttentionBackend
 from aphrodite.common.utils import (STR_BACKEND_ENV_VAR, is_cpu, is_hip,
                                     is_openvino, is_xpu)
 from aphrodite.platforms import current_platform
 
-APHRODITE_ATTENTION_BACKEND = os.getenv("APHRODITE_ATTENTION_BACKEND", None)
+APHRODITE_ATTENTION_BACKEND = envs.APHRODITE_ATTENTION_BACKEND
 
 
 class _Backend(enum.Enum):

+ 34 - 16
aphrodite/common/config.py

@@ -9,6 +9,7 @@ import torch
 from loguru import logger
 from transformers import PretrainedConfig
 
+from aphrodite import envs
 from aphrodite.common.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
                                     cuda_device_count_stateless,
                                     get_cpu_memory, is_cpu, is_hip, is_neuron,
@@ -30,8 +31,7 @@ if TYPE_CHECKING:
         BaseTokenizerGroup)
 
 # If true, will load models from ModelScope instead of Hugging Face Hub.
-APHRODITE_USE_MODELSCOPE = os.environ.get("APHRODITE_USE_MODELSCOPE",
-                                          "False").lower() == "true"
+APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE
 
 _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 
@@ -1820,21 +1820,39 @@ def _get_and_verify_max_len(
                     "original_max_position_embeddings"]
             derived_max_model_len *= scaling_factor
 
+    # If the user specified a max length, make sure it is smaller than the
+    # derived length from the HF model config.
     if max_model_len is None:
-        max_model_len = derived_max_model_len
-    elif max_model_len > derived_max_model_len and rope_scaling_arg is None:
-        raise ValueError(
-            f"User-specified max_model_len {max_model_len} is higher than "
-            f"the original {derived_max_model_len}. "
-            "Please provide a rope_scaling dict to scale the model.")
-    elif max_model_len > derived_max_model_len and rope_scaling_arg is not None:
-        # hope this works
-        logger.warning(
-            f"User-specified max_model_len {max_model_len} is higher than "
-            f"the original {derived_max_model_len}. "
-            "Attempting to use RoPE scaling with the provided rope_scaling "
-            "dict.")
-        derived_max_model_len = max_model_len
+        max_model_len = int(derived_max_model_len)
+    elif max_model_len > derived_max_model_len:
+        # Some models might have a separate key for specifying model_max_length
+        # that will be bigger than derived_max_model_len. We compare user input
+        # with model_max_length and allow this override when it's smaller.
+        model_max_length = getattr(hf_config, "model_max_length", None)
+        if envs.APHRODITE_DYNAMIC_ROPE_SCALING:
+            scaling_factor = max_model_len / derived_max_model_len
+            hf_config.rope_scaling = {"factor": scaling_factor,
+                                      "type": "dynamic"}
+            logger.info(
+                "Using dynamic RoPE scaling to extend the model's max context "
+                f"length from {derived_max_model_len} to {max_model_len}.")
+            derived_max_model_len = max_model_len
+        elif model_max_length is not None and max_model_len <= model_max_length:
+            if disable_sliding_window:
+                # TODO: Find a model that has model_max_length
+                # with sliding window to see if this case should be allowed.
+                raise NotImplementedError(
+                    "Disabling sliding window is not supported for models "
+                    "with model_max_length in the config. Please raise an "
+                    "issue so we can investigate.")
+        else:
+            raise ValueError(
+                f"User-specified max_model_len ({max_model_len}) is greater "
+                f"than the derived max_model_len ({max_len_key}="
+                f"{derived_max_model_len} or model_max_length="
+                f"{model_max_length} in model's config.json). To allow "
+                "greater lengths, please set the env var "
+                "APHRODITE_DYNAMIC_ROPE_SCALING=1")
 
     return int(max_model_len)
 

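For context (not part of the diff): a quick sketch of how the new APHRODITE_DYNAMIC_ROPE_SCALING override is meant to be exercised, assuming a model whose config.json yields a derived limit of 4096 tokens. The model name and lengths below are placeholders, not values from this commit.

import os

# Must be set before the engine builds its ModelConfig; with the lazy envs
# module it is read at that point, not at import time.
os.environ["APHRODITE_DYNAMIC_ROPE_SCALING"] = "1"

from aphrodite import LLM

# max_model_len exceeds the derived 4096-token limit, so per the branch above
# the engine sets hf_config.rope_scaling = {"factor": 2.0, "type": "dynamic"}
# instead of raising ValueError.
llm = LLM(model="some/model", max_model_len=8192)
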
+ 3 - 2
aphrodite/common/logger.py

@@ -15,11 +15,12 @@ from rich.markup import escape
 from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            TaskProgressColumn, TextColumn, TimeRemainingColumn)
 
+from aphrodite import envs
+
 RICH_CONSOLE = Console()
 LOG_LEVEL = os.getenv("APHRODITE_LOG_LEVEL", "INFO").upper()
 
-APHRODITE_CONFIGURE_LOGGING = int(os.getenv("APHRODITE_CONFIGURE_LOGGING",
-                                            "1"))
+APHRODITE_CONFIGURE_LOGGING = envs.APHRODITE_CONFIGURE_LOGGING
 
 
 def unwrap(wrapped, default=None):

+ 3 - 3
aphrodite/common/sampling_params.py

@@ -1,6 +1,5 @@
 """Sampling parameters for text generation."""
 import copy
-import os
 from enum import IntEnum
 from functools import cached_property
 from typing import Any, Callable, Dict, List, Optional, Set, Union
@@ -10,11 +9,12 @@ import torch
 from loguru import logger
 from typing_extensions import Annotated
 
+from aphrodite import envs
+
 _SAMPLING_EPS = 1e-5
 _MAX_TEMP = 1e-2
 
-APHRODITE_NO_DEPRECATION_WARNING = bool(
-    int(os.environ.get("APHRODITE_NO_DEPRECATION_WARNING", "0")))
+APHRODITE_NO_DEPRECATION_WARNING = envs.APHRODITE_NO_DEPRECATION_WARNING
 
 
 class SamplingType(IntEnum):

+ 8 - 11
aphrodite/common/utils.py

@@ -31,6 +31,7 @@ from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            SpinnerColumn, TextColumn, TimeElapsedColumn)
 from typing_extensions import ParamSpec, TypeIs, assert_never
 
+from aphrodite import envs
 from aphrodite.common.logger import enable_trace_function_call
 from aphrodite.distributed import get_tensor_model_parallel_rank
 
@@ -382,8 +383,7 @@ def get_aphrodite_instance_id():
     Instance id represents an instance of the Aphrodite. All processes in the
     same instance should have the same instance id.
     """
-    return os.environ.get("APHRODITE_INSTANCE_ID",
-                          f"aphrodite-instance-{random_uuid()}")
+    return envs.APHRODITE_INSTANCE_ID or f"aphrodite-instance-{random_uuid()}"
 
 
 @lru_cache(maxsize=None)
@@ -520,9 +520,7 @@ def get_distributed_init_method(ip: str, port: int) -> str:
 
 def get_open_zmq_ipc_path() -> str:
     if not in_windows():
-        APHRODITE_RPC_BASE_PATH = os.getenv("APHRODITE_RPC_BASE_PATH",
-                                        tempfile.gettempdir())
-        base_rpc_path = APHRODITE_RPC_BASE_PATH
+        base_rpc_path = envs.APHRODITE_RPC_BASE_PATH
         return f"ipc://{base_rpc_path}/{uuid4()}"
     else:
         # windows doesn't support ipc://
@@ -530,8 +528,7 @@ def get_open_zmq_ipc_path() -> str:
         return f"tcp://127.0.0.1:{get_open_port()}"
      
 def get_open_port(port: Optional[int] = None) -> int:
-    port = int(os.getenv("APHRODITE_PORT", 0)
-                ) if "APHRODITE_PORT" in os.environ else None
+    port = envs.APHRODITE_PORT
     if port is not None:
         while True:
             try:
@@ -948,7 +945,7 @@ def find_library(lib_name: str) -> str:
     # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
     locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
     # `LD_LIBRARY_PATH` searches the library in the user-defined paths
-    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    env_ld_library_path = envs.LD_LIBRARY_PATH
     if not locs and env_ld_library_path:
         locs = [
             os.path.join(dir, lib_name)
@@ -967,7 +964,7 @@ def find_nccl_library() -> str:
     After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
     found by `ctypes` automatically.
     """
-    so_file = os.environ.get("APHRODITE_NCCL_SO_PATH", "")
+    so_file = envs.APHRODITE_NCCL_SO_PATH
 
     # manually load the nccl library
     if so_file:
@@ -985,7 +982,7 @@ def find_nccl_library() -> str:
 
 
 def enable_trace_function_call_for_thread() -> None:
-    if int(os.getenv("APHRODITE_TRACE_FUNCTION", "0")):
+    if envs.APHRODITE_TRACE_FUNCTION:
         tmp_dir = tempfile.gettempdir()
         filename = (f"APHRODITE_TRACE_FUNCTION_for_process_{os.getpid()}"
                     f"_thread_{threading.get_ident()}_"
@@ -1074,7 +1071,7 @@ def cuda_device_count_stateless() -> int:
     # This can be removed and simply replaced with torch.cuda.get_device_count
     # after https://github.com/pytorch/pytorch/pull/122815 is released.
 
-    return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES"))
+    return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
 #From: https://stackoverflow.com/a/4104188/2749989

+ 2 - 2
aphrodite/distributed/device_communicators/custom_all_reduce.py

@@ -1,4 +1,3 @@
-import os
 from contextlib import contextmanager
 from typing import Any, List, Optional, Union
 
@@ -8,6 +7,7 @@ from loguru import logger
 from torch.distributed import ProcessGroup
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.utils import cuda_device_count_stateless
 from aphrodite.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
@@ -95,7 +95,7 @@ class CustomAllreduce:
         assert isinstance(device, torch.device)
         self.device = device
 
-        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:

+ 7 - 6
aphrodite/distributed/device_communicators/custom_all_reduce_utils.py

@@ -11,6 +11,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.utils import (cuda_device_count_stateless,
                                     update_environment_variables)
 from aphrodite.distributed.device_communicators.cuda_wrapper import (
@@ -124,7 +125,7 @@ def can_actually_p2p(
     processes for testing all pairs of GPUs in batch. The trick is to reset
     the device after each test (which is not available in PyTorch).
     """  # noqa
-    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     # pass the CUDA_VISIBLE_DEVICES to the child process
     # to make sure they see the same set of GPUs
 
@@ -183,13 +184,13 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     is_distributed = dist.is_initialized()
 
     num_dev = cuda_device_count_stateless()
-    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
-    APHRODITE_CONFIG_ROOT = os.getenv("APHRODITE_CONFIG_ROOT", "~/.config")
-    path = os.path.expanduser(
-        f"{APHRODITE_CONFIG_ROOT}/aphrodite/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
-    )
+
+    path = os.path.join(
+        envs.APHRODITE_CACHE_ROOT,
+        f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
     os.makedirs(os.path.dirname(path), exist_ok=True)
     from aphrodite.distributed.parallel_state import get_world_group
     if ((not is_distributed or get_world_group().local_rank == 0)

+ 3 - 3
aphrodite/distributed/device_communicators/shm_broadcast.py

@@ -1,4 +1,3 @@
-import os
 import pickle
 import time
 from contextlib import contextmanager
@@ -13,10 +12,11 @@ from loguru import logger
 from torch.distributed import ProcessGroup
 from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
 
+from aphrodite import envs
 from aphrodite.common.utils import get_ip, get_open_port
 
-APHRODITE_RINGBUFFER_WARNING_INTERVAL = os.getenv(
-    "APHRODITE_RINGBUFFER_WARNING_INTERVAL", 60)
+APHRODITE_RINGBUFFER_WARNING_INTERVAL = (
+    envs.APHRODITE_RINGBUFFER_WARNING_INTERVAL)
 
 # time to wait if the queue is full or empty
 # if we sleep for too short, it will consume too much CPU

+ 3 - 2
aphrodite/distributed/parallel_state.py

@@ -21,7 +21,6 @@ If you only need to use the distributed environment without model/pipeline
  steps.
 """
 import contextlib
-import os
 import pickle
 import sys
 from collections import namedtuple
@@ -36,6 +35,8 @@ import torch.distributed
 from loguru import logger
 from torch.distributed import Backend, ProcessGroup
 
+from aphrodite import envs
+
 
 @dataclass
 class GraphCaptureContext:
@@ -866,7 +867,7 @@ def init_distributed_environment(
         # local rank not set, this usually happens in single-node
         # setting, where we can use rank as local rank
         if distributed_init_method == "env://":
-            local_rank = os.getenv("LOCAL_RANK", rank)
+            local_rank = envs.LOCAL_RANK
         else:
             local_rank = rank
     global _WORLD

+ 3 - 2
aphrodite/distributed/utils.py

@@ -3,12 +3,13 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-import os
 from typing import Sequence, Tuple
 
 import torch
 
-APHRODITE_PP_LAYER_PARTITION = os.getenv("APHRODITE_PP_LAYER_PARTITION", None)
+from aphrodite import envs
+
+APHRODITE_PP_LAYER_PARTITION = envs.APHRODITE_PP_LAYER_PARTITION
 
 
 def ensure_divisibility(numerator, denominator):

+ 2 - 1
aphrodite/endpoints/openai/api_server.py

@@ -20,6 +20,7 @@ from fastapi.responses import (HTMLResponse, JSONResponse, Response,
 from loguru import logger
 from starlette.routing import Mount
 
+from aphrodite import envs
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sampling_params import _SAMPLING_EPS, SamplingParams
@@ -635,7 +636,7 @@ def build_app(args: Namespace) -> FastAPI:
         return JSONResponse(err.model_dump(),
                             status_code=HTTPStatus.BAD_REQUEST)
 
-    if token := os.environ.get("APHRODITE_API_KEY") or args.api_keys:
+    if token := envs.APHRODITE_API_KEY or args.api_keys:
         admin_key = os.environ.get("APHRODITE_ADMIN_KEY") or args.admin_key
 
         if admin_key is None:

+ 2 - 3
aphrodite/engine/aphrodite_engine.py

@@ -1,4 +1,3 @@
-import os
 import time
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, ClassVar, Dict, Iterable, List, Optional
@@ -9,6 +8,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DecodingConfig, DeviceConfig,
                                      EngineConfig, LoadConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,
@@ -50,8 +50,7 @@ from aphrodite.version import __version__ as APHRODITE_VERSION
 
 _LOCAL_LOGGING_INTERVAL_SEC = 5
 
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
 
 
 def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:

+ 2 - 3
aphrodite/engine/args_tools.py

@@ -1,13 +1,13 @@
 import argparse
 import dataclasses
 import json
-import os
 from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,
                     Union)
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, ConfigFormat, DecodingConfig,
                                      DeviceConfig, EngineConfig, LoadConfig,
                                      LoadFormat, LoRAConfig, ModelConfig,
@@ -24,8 +24,7 @@ if TYPE_CHECKING:
     from aphrodite.transformers_utils.tokenizer_group import BaseTokenizerGroup
 
 
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
 
 def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
     if len(val) == 0:

+ 2 - 3
aphrodite/engine/async_aphrodite.py

@@ -1,5 +1,4 @@
 import asyncio
-import os
 import time
 from dataclasses import dataclass
 from functools import partial
@@ -11,6 +10,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
+from aphrodite import envs
 from aphrodite.common.config import (DecodingConfig, EngineConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)
@@ -34,8 +34,7 @@ from aphrodite.lora.request import LoRARequest
 from aphrodite.processing.scheduler import SchedulerOutputs
 from aphrodite.prompt_adapter.request import PromptAdapterRequest
 
-ENGINE_ITERATION_TIMEOUT_S = int(
-    os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", "60"))
+ENGINE_ITERATION_TIMEOUT_S = envs.APHRODITE_ENGINE_ITERATION_TIMEOUT_S
 
 
 class AsyncEngineDeadError(RuntimeError):

+ 388 - 0
aphrodite/envs.py

@@ -0,0 +1,388 @@
+import os
+import tempfile
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+
+if TYPE_CHECKING:
+    APHRODITE_HOST_IP: str = ""
+    APHRODITE_PORT: Optional[int] = None
+    APHRODITE_RPC_BASE_PATH: str = tempfile.gettempdir()
+    APHRODITE_USE_MODELSCOPE: bool = False
+    APHRODITE_RINGBUFFER_WARNING_INTERVAL: int = 60
+    APHRODITE_INSTANCE_ID: Optional[str] = None
+    APHRODITE_NCCL_SO_PATH: Optional[str] = None
+    LD_LIBRARY_PATH: Optional[str] = None
+    APHRODITE_USE_TRITON_FLASH_ATTN: bool = False
+    LOCAL_RANK: int = 0
+    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    APHRODITE_ENGINE_ITERATION_TIMEOUT_S: int = 60
+    APHRODITE_API_KEY: Optional[str] = None
+    S3_ACCESS_KEY_ID: Optional[str] = None
+    S3_SECRET_ACCESS_KEY: Optional[str] = None
+    S3_ENDPOINT_URL: Optional[str] = None
+    APHRODITE_CACHE_ROOT: str = os.path.expanduser("~/.cache/aphrodite")
+    APHRODITE_CONFIG_ROOT: str = os.path.expanduser("~/.config/aphrodite")
+    APHRODITE_CONFIGURE_LOGGING: int = 1
+    APHRODITE_LOGGING_LEVEL: str = "INFO"
+    APHRODITE_LOGGING_CONFIG_PATH: Optional[str] = None
+    APHRODITE_TRACE_FUNCTION: int = 0
+    APHRODITE_ATTENTION_BACKEND: Optional[str] = None
+    APHRODITE_USE_SAMPLING_KERNELS: bool = False
+    APHRODITE_PP_LAYER_PARTITION: Optional[str] = None
+    APHRODITE_CPU_KVCACHE_SPACE: int = 0
+    APHRODITE_CPU_OMP_THREADS_BIND: str = ""
+    APHRODITE_OPENVINO_KVCACHE_SPACE: int = 0
+    APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
+    APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
+    APHRODITE_XLA_CACHE_PATH: str = os.path.join(APHRODITE_CACHE_ROOT, "xla_cache")  # noqa: E501
+    APHRODITE_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
+    APHRODITE_USE_RAY_SPMD_WORKER: bool = False
+    APHRODITE_USE_RAY_COMPILED_DAG: bool = False
+    APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
+    APHRODITE_WORKER_MULTIPROC_METHOD: str = "fork"
+    APHRODITE_ASSETS_CACHE: str = os.path.join(APHRODITE_CACHE_ROOT, "assets")
+    APHRODITE_IMAGE_FETCH_TIMEOUT: int = 5
+    APHRODITE_AUDIO_FETCH_TIMEOUT: int = 5
+    APHRODITE_TARGET_DEVICE: str = "cuda"
+    MAX_JOBS: Optional[str] = None
+    NVCC_THREADS: Optional[str] = None
+    APHRODITE_USE_PRECOMPILED: bool = False
+    APHRODITE_NO_DEPRECATION_WARNING: bool = False
+    APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
+    CMAKE_BUILD_TYPE: Optional[str] = None
+    VERBOSE: bool = False
+    APHRODITE_DYNAMIC_ROPE_SCALING: bool = False
+    APHRODITE_TEST_FORCE_FP8_MARLIN: bool = False
+    APHRODITE_ALLOW_ENGINE_USE_RAY: bool = False
+    APHRODITE_PLUGINS: Optional[List[str]] = None
+
+
+def get_default_cache_root():
+    return os.getenv(
+        "XDG_CACHE_HOME",
+        os.path.join(os.path.expanduser("~"), ".cache"),
+    )
+
+
+def get_default_config_root():
+    return os.getenv(
+        "XDG_CONFIG_HOME",
+        os.path.join(os.path.expanduser("~"), ".config"),
+    )
+
+
+# The begin-* and end* here are used by the documentation generator
+# to extract the used env vars.
+
+# begin-env-vars-definition
+
+environment_variables: Dict[str, Callable[[], Any]] = {
+
+    # ================== Installation Time Env Vars ==================
+
+    # Target device of Aphrodite, supporting [cuda (by default),
+    # rocm, neuron, cpu, openvino]
+    "APHRODITE_TARGET_DEVICE":
+    lambda: os.getenv("APHRODITE_TARGET_DEVICE", "cuda"),
+
+    # Maximum number of compilation jobs to run in parallel.
+    # By default this is the number of CPUs
+    "MAX_JOBS":
+    lambda: os.getenv("MAX_JOBS", None),
+
+    # Number of threads to use for nvcc
+    # By default this is 1.
+    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
+    "NVCC_THREADS":
+    lambda: os.getenv("NVCC_THREADS", None),
+
+    # If set, Aphrodite will use precompiled binaries (*.so)
+    "APHRODITE_USE_PRECOMPILED":
+    lambda: bool(os.environ.get("APHRODITE_USE_PRECOMPILED")),
+
+    # CMake build type
+    # If not set, defaults to "Debug" or "RelWithDebInfo"
+    # Available options: "Debug", "Release", "RelWithDebInfo"
+    "CMAKE_BUILD_TYPE":
+    lambda: os.getenv("CMAKE_BUILD_TYPE"),
+
+    # If set, Aphrodite will print verbose logs during installation
+    "VERBOSE":
+    lambda: bool(int(os.getenv('VERBOSE', '0'))),
+
+    # Root directory for APHRODITE configuration files
+    # Defaults to `~/.config/aphrodite` unless `XDG_CONFIG_HOME` is set
+    # Note that this not only affects how aphrodite finds its configuration
+    # files during runtime, but also affects how aphrodite installs its
+    # configuration files during **installation**.
+    "APHRODITE_CONFIG_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_CONFIG_ROOT",
+            os.path.join(get_default_config_root(), "aphrodite"),
+        )),
+
+    # ================== Runtime Env Vars ==================
+
+    # Root directory for APHRODITE cache files
+    # Defaults to `~/.cache/aphrodite` unless `XDG_CACHE_HOME` is set
+    "APHRODITE_CACHE_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_CACHE_ROOT",
+            os.path.join(get_default_cache_root(), "aphrodite"),
+        )),
+
+    # used in distributed environment to determine the master address
+    'APHRODITE_HOST_IP':
+    lambda: os.getenv('APHRODITE_HOST_IP', "") or os.getenv("HOST_IP", ""),
+
+    # used in distributed environment to manually set the communication port
+    # Note: if APHRODITE_PORT is set, and some code asks for multiple ports, the
+    # APHRODITE_PORT will be used as the first port, and the rest will be
+    # generated by incrementing the APHRODITE_PORT value.
+    # '0' is used to make mypy happy
+    'APHRODITE_PORT':
+    lambda: int(os.getenv('APHRODITE_PORT', '0'))
+    if 'APHRODITE_PORT' in os.environ else None,
+
+    # path used for ipc when the frontend api server is running in
+    # multi-processing mode to communicate with the backend engine process.
+    'APHRODITE_RPC_BASE_PATH':
+    lambda: os.getenv('APHRODITE_RPC_BASE_PATH', tempfile.gettempdir()),
+
+    # If true, will load models from ModelScope instead of Hugging Face Hub.
+    # note that the value is true or false, not numbers
+    "APHRODITE_USE_MODELSCOPE":
+    lambda: os.environ.get(
+        "APHRODITE_USE_MODELSCOPE", "False").lower() == "true",
+
+    # Instance id represents an instance of the APHRODITE. All processes in the
+    # same instance should have the same instance id.
+    "APHRODITE_INSTANCE_ID":
+    lambda: os.environ.get("APHRODITE_INSTANCE_ID", None),
+
+    # Interval in seconds to log a warning message when the ring buffer is full
+    "APHRODITE_RINGBUFFER_WARNING_INTERVAL":
+    lambda: int(os.environ.get("APHRODITE_RINGBUFFER_WARNING_INTERVAL", "60")),
+
+    # path to cudatoolkit home directory, under which should be bin, include,
+    # and lib directories.
+    "CUDA_HOME":
+    lambda: os.environ.get("CUDA_HOME", None),
+
+    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
+    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
+    "APHRODITE_NCCL_SO_PATH":
+    lambda: os.environ.get("APHRODITE_NCCL_SO_PATH", None),
+
+    # when `APHRODITE_NCCL_SO_PATH` is not set, aphrodite will try to find the
+    # nccl library file in the locations specified by `LD_LIBRARY_PATH`
+    "LD_LIBRARY_PATH":
+    lambda: os.environ.get("LD_LIBRARY_PATH", None),
+
+    # flag to control if aphrodite should use triton flash attention
+    "APHRODITE_USE_TRITON_FLASH_ATTN":
+    lambda: (os.environ.get(
+        "APHRODITE_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")),
+
+    # Internal flag to enable Dynamo graph capture
+    "APHRODITE_TEST_DYNAMO_GRAPH_CAPTURE":
+    lambda: int(os.environ.get("APHRODITE_TEST_DYNAMO_GRAPH_CAPTURE", "0")),
+
+    # local rank of the process in the distributed setting, used to determine
+    # the GPU device id
+    "LOCAL_RANK":
+    lambda: int(os.environ.get("LOCAL_RANK", "0")),
+
+    # used to control the visible devices in the distributed setting
+    "CUDA_VISIBLE_DEVICES":
+    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
+
+    # timeout for each iteration in the engine
+    "APHRODITE_ENGINE_ITERATION_TIMEOUT_S":
+    lambda: int(os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", "60")),
+
+    # API key for APHRODITE API server
+    "APHRODITE_API_KEY":
+    lambda: os.environ.get("APHRODITE_API_KEY", None),
+
+    # S3 access information, used for tensorizer to load model from S3
+    "S3_ACCESS_KEY_ID":
+    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
+    "S3_SECRET_ACCESS_KEY":
+    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+    "S3_ENDPOINT_URL":
+    lambda: os.environ.get("S3_ENDPOINT_URL", None),
+
+    # Logging configuration
+    # If set to 0, aphrodite will not configure logging
+    # If set to 1, aphrodite will configure logging using the default
+    # configuration or the configuration file specified by
+    # APHRODITE_LOGGING_CONFIG_PATH
+    "APHRODITE_CONFIGURE_LOGGING":
+    lambda: int(os.getenv("APHRODITE_CONFIGURE_LOGGING", "1")),
+    "APHRODITE_LOGGING_CONFIG_PATH":
+    lambda: os.getenv("APHRODITE_LOGGING_CONFIG_PATH"),
+
+    # this is used for configuring the default logging level
+    "APHRODITE_LOGGING_LEVEL":
+    lambda: os.getenv("APHRODITE_LOGGING_LEVEL", "INFO"),
+
+    # Trace function calls
+    # If set to 1, aphrodite will trace function calls
+    # Useful for debugging
+    "APHRODITE_TRACE_FUNCTION":
+    lambda: int(os.getenv("APHRODITE_TRACE_FUNCTION", "0")),
+
+    # Backend for attention computation
+    # Available options:
+    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+    # - "FLASH_ATTN": use FlashAttention
+    # - "XFORMERS": use XFormers
+    # - "ROCM_FLASH": use ROCmFlashAttention
+    # - "FLASHINFER": use flashinfer
+    "APHRODITE_ATTENTION_BACKEND":
+    lambda: os.getenv("APHRODITE_ATTENTION_BACKEND", None),
+
+    # If set, aphrodite will use flashinfer sampler
+    "APHRODITE_USE_SAMPLING_KERNELS":
+    lambda: bool(int(os.getenv("APHRODITE_USE_SAMPLING_KERNELS", "0"))),
+
+    # Pipeline stage partition strategy
+    "APHRODITE_PP_LAYER_PARTITION":
+    lambda: os.getenv("APHRODITE_PP_LAYER_PARTITION", None),
+
+    # (CPU backend only) CPU key-value cache space.
+    # default is 4GB
+    "APHRODITE_CPU_KVCACHE_SPACE":
+    lambda: int(os.getenv("APHRODITE_CPU_KVCACHE_SPACE", "0")),
+
+    # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
+    # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
+    "APHRODITE_CPU_OMP_THREADS_BIND":
+    lambda: os.getenv("APHRODITE_CPU_OMP_THREADS_BIND", "all"),
+
+    # OpenVINO key-value cache space
+    # default is 4GB
+    "APHRODITE_OPENVINO_KVCACHE_SPACE":
+    lambda: int(os.getenv("APHRODITE_OPENVINO_KVCACHE_SPACE", "0")),
+
+    # OpenVINO KV cache precision
+    # default is bf16 if natively supported by platform, otherwise f16
+    # To enable KV cache compression, please, explicitly specify u8
+    "APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION":
+    lambda: os.getenv("APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION", None),
+
+    # Enables weights compression during model export via HF Optimum
+    # default is False
+    "APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
+    lambda: bool(os.getenv(
+        "APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
+
+    # If the env var is set, then all workers will execute as separate
+    # processes from the engine, and we use the same mechanism to trigger
+    # execution on all workers.
+    # Run aphrodite with APHRODITE_USE_RAY_SPMD_WORKER=1 to enable it.
+    "APHRODITE_USE_RAY_SPMD_WORKER":
+    lambda: bool(int(os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", "0"))),
+
+    # If the env var is set, it uses the Ray's compiled DAG API
+    # which optimizes the control plane overhead.
+    # Run aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
+    "APHRODITE_USE_RAY_COMPILED_DAG":
+    lambda: bool(int(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", "0"))),
+
+    # If the env var is set, it uses NCCL for communication in
+    # Ray's compiled DAG. This flag is ignored if
+    # APHRODITE_USE_RAY_COMPILED_DAG is not set.
+    "APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
+    lambda: bool(int(
+        os.getenv("APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))),
+
+    # Use dedicated multiprocess context for workers.
+    # Both spawn and fork work
+    "APHRODITE_WORKER_MULTIPROC_METHOD":
+    lambda: os.getenv("APHRODITE_WORKER_MULTIPROC_METHOD", "fork"),
+
+    # Path to the cache for storing downloaded assets
+    "APHRODITE_ASSETS_CACHE":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_ASSETS_CACHE",
+            os.path.join(get_default_cache_root(), "aphrodite", "assets"),
+        )),
+
+    # Timeout for fetching images when serving multimodal models
+    # Default is 5 seconds
+    "APHRODITE_IMAGE_FETCH_TIMEOUT":
+    lambda: int(os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT", "5")),
+
+    # Timeout for fetching audio when serving multimodal models
+    # Default is 5 seconds
+    "APHRODITE_AUDIO_FETCH_TIMEOUT":
+    lambda: int(os.getenv("APHRODITE_AUDIO_FETCH_TIMEOUT", "5")),
+
+    # Path to the XLA persistent cache directory.
+    # Only used for XLA devices such as TPUs.
+    "APHRODITE_XLA_CACHE_PATH":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_XLA_CACHE_PATH",
+            os.path.join(get_default_cache_root(), "aphrodite", "xla_cache"),
+        )),
+    "APHRODITE_FUSED_MOE_CHUNK_SIZE":
+    lambda: int(os.getenv("APHRODITE_FUSED_MOE_CHUNK_SIZE", "65536")),
+
+    # If set, aphrodite will skip the deprecation warnings.
+    "APHRODITE_NO_DEPRECATION_WARNING":
+    lambda: bool(int(os.getenv("APHRODITE_NO_DEPRECATION_WARNING", "0"))),
+
+    # If set, the OpenAI API server will stay alive even after the underlying
+    # AsyncLLMEngine errors and stops serving requests
+    "APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH":
+    lambda: bool(os.getenv("APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
+
+    # If the env var APHRODITE_DYNAMIC_ROPE_SCALING is set, it allows
+    # the user to specify a max sequence length greater than
+    # the max length derived from the model's config.json.
+    # To enable this, set APHRODITE_DYNAMIC_ROPE_SCALING=1.
+    "APHRODITE_DYNAMIC_ROPE_SCALING":
+    lambda:
+    (os.environ.get(
+        "APHRODITE_DYNAMIC_ROPE_SCALING",
+        "0").strip().lower() in ("1", "true")),
+
+    # If set, forces FP8 Marlin to be used for FP8 quantization regardless
+    # of the hardware support for FP8 compute.
+    "APHRODITE_TEST_FORCE_FP8_MARLIN":
+    lambda:
+    (os.environ.get("APHRODITE_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
+     ("1", "true")),
+
+    # If set, allow running the engine as a separate ray actor,
+    # which is a deprecated feature soon to be removed.
+    "APHRODITE_ALLOW_ENGINE_USE_RAY":
+    lambda:
+    (os.environ.get("APHRODITE_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
+     ("1", "true")),
+
+    # a list of plugin names to load, separated by commas.
+    # if this is not set, it means all plugins will be loaded
+    # if this is set to an empty string, no plugins will be loaded
+    "APHRODITE_PLUGINS":
+    lambda: None if "APHRODITE_PLUGINS" not in os.environ else os.environ[
+        "APHRODITE_PLUGINS"].split(","),
+}
+
+# end-env-vars-definition
+
+
+def __getattr__(name: str):
+    # lazy evaluation of environment variables
+    if name in environment_variables:
+        return environment_variables[name]()
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    return list(environment_variables.keys())

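A short illustration (mine, not part of the commit) of the lazy lookup that the module-level __getattr__ above provides: every attribute access re-runs the corresponding lambda against the current process environment, so values exported after import are still honoured. The port value is only an example.

import os
from aphrodite import envs

print(envs.APHRODITE_PORT)        # None while APHRODITE_PORT is unset

os.environ["APHRODITE_PORT"] = "2242"   # example value, not a project default
print(envs.APHRODITE_PORT)        # 2242 -- re-evaluated on this access

print("APHRODITE_ATTENTION_BACKEND" in dir(envs))   # True; dir() lists the keys
# envs.NOT_A_REAL_VAR would raise AttributeError, per __getattr__ above.
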
+ 2 - 1
aphrodite/executor/cpu_executor.py

@@ -5,6 +5,7 @@ from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import CacheConfig, ModelConfig, SchedulerConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_aphrodite_instance_id,
@@ -333,7 +334,7 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
         logger.warning("Prefix caching is not supported on CPU, disable it.")
         config.enable_prefix_caching = False
 
-    kv_cache_space_str = os.getenv("APHRODITE_CPU_KVCACHE_SPACE", "0")
+    kv_cache_space_str = envs.APHRODITE_CPU_KVCACHE_SPACE
     kv_cache_space = int(kv_cache_space_str)
 
     if kv_cache_space >= 0:

+ 3 - 1
aphrodite/executor/multiproc_worker_utils.py

@@ -14,6 +14,8 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
 
 from loguru import logger
 
+from aphrodite import envs
+
 T = TypeVar('T')
 
 _TERMINATE = "TERMINATE"  # sentinel
@@ -26,7 +28,7 @@ JOIN_TIMEOUT_S = 2
 
 # Use dedicated multiprocess context for workers.
 # Both spawn and fork work
-mp_method = os.getenv("APHRODITE_WORKER_MULTIPROC_METHOD", "fork")
+mp_method = envs.APHRODITE_WORKER_MULTIPROC_METHOD
 mp = multiprocessing.get_context(mp_method)
 
 

+ 4 - 5
aphrodite/executor/openvino_executor.py

@@ -1,4 +1,3 @@
-import os
 from typing import List, Set, Tuple
 
 import openvino as ov
@@ -6,6 +5,7 @@ import openvino.properties.hint as hints
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import CacheConfig, ModelConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_distributed_init_method,
@@ -13,10 +13,9 @@ from aphrodite.common.utils import (GiB_bytes, get_distributed_init_method,
 from aphrodite.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from aphrodite.lora.request import LoRARequest
 
-APHRODITE_OPENVINO_KVCACHE_SPACE = int(
-    os.getenv("APHRODITE_OPENVINO_KVCACHE_SPACE", 0))
-APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION = os.getenv(
-    "APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION", None)
+APHRODITE_OPENVINO_KVCACHE_SPACE = envs.APHRODITE_OPENVINO_KVCACHE_SPACE
+APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION = (
+    envs.APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION)
 
 
 class OpenVINOExecutor(ExecutorBase):

+ 7 - 8
aphrodite/executor/ray_gpu_executor.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 import msgspec
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (_run_task_with_lock,
                                     get_aphrodite_instance_id,
@@ -26,14 +27,12 @@ if TYPE_CHECKING:
 # If the env var is set, it uses the Ray's compiled DAG API
 # which optimizes the control plane overhead.
 # Run Aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
-APHRODITE_USE_RAY_COMPILED_DAG = bool(
-    os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", 0))
-APHRODITE_TRACE_FUNCTION = int(os.getenv("APHRODITE_TRACE_FUNCTION", 0))
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
-
-APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = bool(
-    int(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", 1)))
+APHRODITE_USE_RAY_COMPILED_DAG = envs.APHRODITE_USE_RAY_COMPILED_DAG
+APHRODITE_TRACE_FUNCTION = envs.APHRODITE_TRACE_FUNCTION
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
+
+APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = (
+    envs.APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL)
 
 
 class RayGPUExecutor(DistributedGPUExecutor):

+ 2 - 1
aphrodite/executor/ray_tpu_executor.py

@@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple,
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (get_aphrodite_instance_id,
                                     get_distributed_init_method, get_ip,
@@ -21,7 +22,7 @@ if ray is not None:
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
 
-APHRODITE_TRACE_FUNCTION = int(os.getenv("APHRODITE_TRACE_FUNCTION", 0))
+APHRODITE_TRACE_FUNCTION = envs.APHRODITE_TRACE_FUNCTION
 
 
 class RayTPUExecutor(TPUExecutor):

+ 2 - 1
aphrodite/executor/ray_xpu_executor.py

@@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
                                      PromptAdapterConfig, SchedulerConfig,
@@ -28,7 +29,7 @@ if TYPE_CHECKING:
 # If the env var is set, it uses the Ray's compiled DAG API
 # which optimizes the control plane overhead.
 # Run Aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
-USE_RAY_COMPILED_DAG = bool(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", 0))
+USE_RAY_COMPILED_DAG = envs.APHRODITE_USE_RAY_COMPILED_DAG
 
 
 class RayXPUExecutor(DistributedGPUExecutor):

+ 2 - 2
aphrodite/modeling/layers/fused_moe/fused_moe.py

@@ -10,10 +10,10 @@ import triton.language as tl
 from loguru import logger
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.platforms import current_platform
 
-APHRODITE_FUSED_MOE_CHUNK_SIZE = int(
-    os.getenv("APHRODITE_FUSED_MOE_CHUNK_SIZE", "65536"))
+APHRODITE_FUSED_MOE_CHUNK_SIZE = envs.APHRODITE_FUSED_MOE_CHUNK_SIZE
 
 
 @triton.jit

+ 2 - 3
aphrodite/modeling/layers/sampler.py

@@ -1,6 +1,5 @@
 """A layer that samples the next tokens from the model's outputs."""
 import itertools
-import os
 import warnings
 from enum import IntEnum
 from math import inf
@@ -11,6 +10,7 @@ import torch.nn as nn
 from loguru import logger
 
 import aphrodite._custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.sampling_params import SamplingType
 from aphrodite.common.sequence import (CompletionSequenceGroupOutput, Logprob,
                                        PromptLogprobs, SampleLogprobs,
@@ -34,8 +34,7 @@ _TEMPERATURE_MINIMUM = 2e-5
 
 # If enabled, we switch to a more performant implementation
 # of top-k and top-p
-APHRODITE_USE_SAMPLING_KERNELS = bool(int(
-    os.getenv("APHRODITE_USE_SAMPLING_KERNELS", "0")))
+APHRODITE_USE_SAMPLING_KERNELS = envs.APHRODITE_USE_SAMPLING_KERNELS
 
 
 class SamplerID(IntEnum):

+ 3 - 3
aphrodite/modeling/model_loader/openvino.py

@@ -1,5 +1,4 @@
 # ruff: noqa: SIM117
-import os
 from pathlib import Path
 from typing import List, Optional, Tuple
 
@@ -11,6 +10,7 @@ from openvino._offline_transformations import paged_attention_transformation
 from optimum.intel import OVModelForCausalLM
 from torch import nn
 
+from aphrodite import envs
 from aphrodite.attention.backends.openvino import OpenVINOAttentionMetadata
 from aphrodite.common.config import DeviceConfig, ModelConfig
 from aphrodite.common.sequence import SamplerOutput
@@ -19,8 +19,8 @@ from aphrodite.modeling.layers.logits_processor import (LogitsProcessor,
 from aphrodite.modeling.layers.sampler import Sampler
 from aphrodite.modeling.sampling_metadata import SamplingMetadata
 
-APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS = bool(
-    os.getenv("APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False))
+APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS = (
+    envs.APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS)
 
 
 def _flattenize_inputs(inputs):

+ 4 - 3
aphrodite/modeling/model_loader/tensorizer.py

@@ -13,6 +13,7 @@ from loguru import logger
 from torch import nn
 from transformers import PretrainedConfig
 
+from aphrodite import envs
 from aphrodite.common.config import ModelConfig, ParallelConfig
 from aphrodite.engine.aphrodite_engine import AphroditeEngine
 from aphrodite.engine.args_tools import EngineArgs
@@ -148,12 +149,12 @@ class TensorizerArgs:
     def __post_init__(self):
         self.file_obj = self.tensorizer_uri
         self.s3_access_key_id = (self.s3_access_key_id
-                                 or os.environ.get("S3_ACCESS_KEY_ID")) or None
+                                 or envs.S3_ACCESS_KEY_ID) or None
         self.s3_secret_access_key = (
             self.s3_secret_access_key
-            or os.environ.get("S3_SECRET_ACCESS_KEY")) or None
+            or envs.S3_SECRET_ACCESS_KEY) or None
         self.s3_endpoint = (self.s3_endpoint
-                            or os.environ.get("S3_ENDPOINT_URL")) or None
+                            or envs.S3_ENDPOINT_URL) or None
         self.stream_params = {
             "s3_access_key_id": self.s3_access_key_id,
             "s3_secret_access_key": self.s3_secret_access_key,

+ 3 - 5
aphrodite/multimodal/utils.py

@@ -1,5 +1,4 @@
 import base64
-import os
 from io import BytesIO
 from typing import Tuple, Union
 
@@ -8,14 +7,13 @@ import numpy as np
 import soundfile
 from PIL import Image
 
+from aphrodite import envs
 from aphrodite.common.connections import global_http_connection
 from aphrodite.multimodal.base import MultiModalDataDict
 
-APHRODITE_IMAGE_FETCH_TIMEOUT = int(
-    os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT", 10))
+APHRODITE_IMAGE_FETCH_TIMEOUT = envs.APHRODITE_IMAGE_FETCH_TIMEOUT
 
-APHRODITE_AUDIO_FETCH_TIMEOUT = int(
-    os.getenv("APHRODITE_AUDIO_FETCH_TIMEOUT", 10))
+APHRODITE_AUDIO_FETCH_TIMEOUT = envs.APHRODITE_AUDIO_FETCH_TIMEOUT
 
 
 def _load_image_from_bytes(b: bytes):

+ 3 - 5
aphrodite/plugins/__init__.py

@@ -1,9 +1,7 @@
-import os
-
 from loguru import logger
 
-APHRODITE_PLUGINS = None if "APHRODITE_PLUGINS" not in os.environ else \
-    os.environ["APHRODITE_PLUGINS"].split(",")
+from aphrodite import envs
+
 
 def load_general_plugins():
     """WARNING: plugins can be loaded for multiple times in different
@@ -16,7 +14,7 @@ def load_general_plugins():
     else:
         from importlib.metadata import entry_points
 
-    allowed_plugins = APHRODITE_PLUGINS
+    allowed_plugins = envs.APHRODITE_PLUGINS
 
     discovered_plugins = entry_points(group='aphrodite.general_plugins')
     for plugin in discovered_plugins:

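Side note (not from the diff): the APHRODITE_PLUGINS value is a plain comma split, which is what makes the three states described in the envs.py comment behave as documented. A standalone restatement of that lambda:

import os

def parse_plugins():
    # Mirrors the lambda registered under "APHRODITE_PLUGINS" in envs.py.
    return (None if "APHRODITE_PLUGINS" not in os.environ
            else os.environ["APHRODITE_PLUGINS"].split(","))

# Unset      -> None            (allowed_plugins is None: load every plugin)
# "foo,bar"  -> ["foo", "bar"]  (only entry points named foo or bar load)
# ""         -> [""]            (matches no entry-point name, nothing loads)
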
+ 2 - 3
aphrodite/quantization/fp8.py

@@ -1,4 +1,3 @@
-import os
 from typing import Any, Dict, List, Optional
 
 import torch
@@ -7,6 +6,7 @@ from torch.nn import Module
 from torch.nn.parameter import Parameter
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.utils import is_hip, print_warning_once
 from aphrodite.modeling.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from aphrodite.modeling.layers.linear import (LinearBase, LinearMethodBase,
@@ -26,8 +26,7 @@ from aphrodite.quantization.utils.w8a8_utils import (
     requantize_with_max_scale)
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
-APHRODITE_TEST_FORCE_FP8_MARLIN = os.environ.get(
-    "APHRODITE_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in ("1", "true")
+APHRODITE_TEST_FORCE_FP8_MARLIN = envs.APHRODITE_TEST_FORCE_FP8_MARLIN
 
 
 class Fp8Config(QuantizationConfig):

+ 3 - 3
aphrodite/server/launch.py

@@ -1,5 +1,4 @@
 import asyncio
-import os
 import signal
 from http import HTTPStatus
 from typing import Any
@@ -8,12 +7,13 @@ import uvicorn
 from fastapi import FastAPI, Response
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.utils import find_process_using_port, in_windows
 from aphrodite.engine.async_aphrodite import AsyncEngineDeadError
 from aphrodite.engine.protocol import AsyncEngineClient
 
-APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH = bool(os.getenv(
-    "APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH", 0))
+APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH = (
+    envs.APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH)
 
 
 async def serve_http(app: FastAPI, engine: AsyncEngineClient,

+ 2 - 3
aphrodite/task_handler/cpu_worker.py

@@ -1,10 +1,10 @@
 """A CPU worker class."""
-import os
 from typing import Dict, List, Optional, Tuple
 
 import torch
 import torch.distributed
 
+from aphrodite import envs
 from aphrodite.attention import get_attn_backend
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
@@ -19,8 +19,7 @@ from aphrodite.task_handler.worker_base import (LocalOrDistributedWorkerBase,
                                                 LoraNotSupportedWorkerBase,
                                                 WorkerInput)
 
-APHRODITE_CPU_OMP_THREADS_BIND = os.getenv("APHRODITE_CPU_OMP_THREADS_BIND",
-                                           "all")
+APHRODITE_CPU_OMP_THREADS_BIND = envs.APHRODITE_CPU_OMP_THREADS_BIND
 
 
 class CPUCacheEngine:

+ 2 - 2
aphrodite/task_handler/tpu_worker.py

@@ -5,6 +5,7 @@ import torch
 import torch_xla.core.xla_model as xm
 import torch_xla.runtime as xr
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)
@@ -99,8 +100,7 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         # Use persistent cache to avoid XLA recompilation.
         # NOTE: Set per-rank cache path since different ranks
         # can have slightly different XLA graphs.
-        APHRODITE_XLA_CACHE_PATH = os.getenv("APHRODITE_XLA_CACHE_PATH",
-                                             "~/.aphrodite/xla_cache/")
+        APHRODITE_XLA_CACHE_PATH = envs.APHRODITE_XLA_CACHE_PATH
         world_size = self.parallel_config.world_size
         per_rank_path = os.path.join(APHRODITE_XLA_CACHE_PATH,
                                      f"tp{world_size}_rank{self.rank}")

+ 2 - 2
aphrodite/transformers_utils/config.py

@@ -1,7 +1,6 @@
 import contextlib
 import enum
 import json
-import os
 from pathlib import Path
 from typing import Any, Dict, Optional, Type, Union
 
@@ -14,6 +13,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
+from aphrodite import envs
 from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   InternVLChatConfig,
                                                   JAISConfig, MedusaConfig,
@@ -21,7 +21,7 @@ from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   MPTConfig, RWConfig)
 from aphrodite.transformers_utils.utils import check_gguf_file
 
-APHRODITE_USE_MODELSCOPE = os.getenv("APHRODITE_USE_MODELSCOPE", "0") == "1"
+APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE
 
 if APHRODITE_USE_MODELSCOPE:
     from modelscope import AutoConfig

+ 4 - 5
examples/tensorize_aphrodite_model.py

@@ -1,10 +1,9 @@
 import argparse
 import dataclasses
 import json
-import os
 import uuid
 
-from aphrodite import LLM
+from aphrodite import LLM, envs
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.modeling.model_loader.tensorizer import (
     TensorizerArgs, TensorizerConfig, tensorize_aphrodite_model)
@@ -177,11 +176,11 @@ if __name__ == '__main__':
     args = parse_args()
 
     s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-                        or os.environ.get("S3_ACCESS_KEY_ID", None))
+                        or envs.S3_ACCESS_KEY_ID)
     s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
+                            or envs.S3_SECRET_ACCESS_KEY)
     s3_endpoint = (getattr(args, 's3_endpoint', None)
-                or os.environ.get("S3_ENDPOINT_URL", None))
+                or envs.S3_ENDPOINT_URL)
 
     credentials = {
         "s3_access_key_id": s3_access_key_id,

+ 22 - 7
setup.py

@@ -1,3 +1,4 @@
+import importlib.util
 import io
 import logging
 import os
@@ -14,10 +15,16 @@ from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 from torch.utils.cpp_extension import CUDA_HOME
 
+
+def load_module_from_path(module_name, path):
+    spec = importlib.util.spec_from_file_location(module_name, path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
-# Target device of Aphrodite, supporting [cuda (by default), rocm, neuron, cpu]
-APHRODITE_TARGET_DEVICE = os.getenv("APHRODITE_TARGET_DEVICE", "cuda")
 
 
 def embed_commit_hash():
@@ -47,6 +54,14 @@ def embed_commit_hash():
 
 embed_commit_hash()
 
+
+# cannot import envs directly because it depends on aphrodite,
+#  which is not installed yet
+envs = load_module_from_path('envs', os.path.join(
+    ROOT_DIR, 'aphrodite', 'envs.py'))
+
+APHRODITE_TARGET_DEVICE = envs.APHRODITE_TARGET_DEVICE
+
 if not sys.platform.startswith("linux"):
     logger.warning(
         "Aphrodite only supports Linux platform (including WSL). "
@@ -97,7 +112,7 @@ class cmake_build_ext(build_ext):
     def compute_num_jobs(self):
         # `num_jobs` is either the value of the MAX_JOBS environment variable
         # (if defined) or the number of CPUs available.
-        num_jobs = os.environ.get("MAX_JOBS", None)
+        num_jobs = envs.MAX_JOBS
         if num_jobs is not None:
             num_jobs = int(num_jobs)
             logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")
@@ -118,7 +133,7 @@ class cmake_build_ext(build_ext):
             # environment variable (if defined) or 1.
             # when it is set, we reduce `num_jobs` to avoid
             # overloading the system.
-            nvcc_threads = os.getenv("NVCC_THREADS", None)
+            nvcc_threads = envs.NVCC_THREADS
             if nvcc_threads is not None:
                 nvcc_threads = int(nvcc_threads)
                 logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number"
@@ -143,7 +158,7 @@ class cmake_build_ext(build_ext):
         # Select the build type.
         # Note: optimization level + debug info are set by the build type
         default_cfg = "Debug" if self.debug else "RelWithDebInfo"
-        cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
+        cfg = envs.CMAKE_BUILD_TYPE or default_cfg
 
         # where .so files will be written, should be the same for all extensions
         # that use the same CMakeLists.txt.
@@ -161,7 +176,7 @@ class cmake_build_ext(build_ext):
             '-DAPHRODITE_TARGET_DEVICE={}'.format(APHRODITE_TARGET_DEVICE),
         ]
 
-        verbose = bool(int(os.getenv('VERBOSE', '0')))
+        verbose = envs.VERBOSE
         if verbose:
             cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
 
@@ -469,7 +484,7 @@ package_data = {
         "py.typed", "modeling/layers/fused_moe/configs/*.json"
     ]
 }
-if os.environ.get("APHRODITE_USE_PRECOMPILED"):
+if envs.APHRODITE_USE_PRECOMPILED:
     ext_modules = []
     package_data["aphrodite"].append("*.so")
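
For completeness, and not part of the commit: because aphrodite/envs.py depends only on the standard library, the bootstrap that setup.py performs above can be reproduced in isolation. The sketch below simply restates load_module_from_path as a runnable script run from the repository root.

# Load envs.py straight from its file path, bypassing the aphrodite package
# (which is not importable at build time). Mirrors the setup.py bootstrap.
import importlib.util
import os
import sys

def load_module_from_path(module_name, path):
    spec = importlib.util.spec_from_file_location(module_name, path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module

ROOT_DIR = os.path.dirname(__file__)
envs = load_module_from_path(
    "envs", os.path.join(ROOT_DIR, "aphrodite", "envs.py"))
print(envs.APHRODITE_TARGET_DEVICE)   # "cuda" unless overridden in the env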