
chore: consolidate environment variables within one file (#882)

AlpinDale, 3 months ago
Parent commit: 901900854e
36 changed files with 543 additions and 132 deletions
  1. +3 -7      aphrodite/assets/base.py
  2. +2 -4      aphrodite/attention/backends/rocm_flash_attn.py
  3. +2 -1      aphrodite/attention/selector.py
  4. +34 -16    aphrodite/common/config.py
  5. +3 -2      aphrodite/common/logger.py
  6. +3 -3      aphrodite/common/sampling_params.py
  7. +8 -11     aphrodite/common/utils.py
  8. +2 -2      aphrodite/distributed/device_communicators/custom_all_reduce.py
  9. +7 -6      aphrodite/distributed/device_communicators/custom_all_reduce_utils.py
 10. +3 -3      aphrodite/distributed/device_communicators/shm_broadcast.py
 11. +3 -2      aphrodite/distributed/parallel_state.py
 12. +3 -2      aphrodite/distributed/utils.py
 13. +2 -1      aphrodite/endpoints/openai/api_server.py
 14. +2 -3      aphrodite/engine/aphrodite_engine.py
 15. +2 -3      aphrodite/engine/args_tools.py
 16. +2 -3      aphrodite/engine/async_aphrodite.py
 17. +388 -0    aphrodite/envs.py
 18. +2 -1      aphrodite/executor/cpu_executor.py
 19. +3 -1      aphrodite/executor/multiproc_worker_utils.py
 20. +4 -5      aphrodite/executor/openvino_executor.py
 21. +7 -8      aphrodite/executor/ray_gpu_executor.py
 22. +2 -1      aphrodite/executor/ray_tpu_executor.py
 23. +2 -1      aphrodite/executor/ray_xpu_executor.py
 24. +2 -2      aphrodite/modeling/layers/fused_moe/fused_moe.py
 25. +2 -3      aphrodite/modeling/layers/sampler.py
 26. +3 -3      aphrodite/modeling/model_loader/openvino.py
 27. +4 -3      aphrodite/modeling/model_loader/tensorizer.py
 28. +3 -5      aphrodite/multimodal/utils.py
 29. +3 -5      aphrodite/plugins/__init__.py
 30. +2 -3      aphrodite/quantization/fp8.py
 31. +3 -3      aphrodite/server/launch.py
 32. +2 -3      aphrodite/task_handler/cpu_worker.py
 33. +2 -2      aphrodite/task_handler/tpu_worker.py
 34. +2 -2      aphrodite/transformers_utils/config.py
 35. +4 -5      examples/tensorize_aphrodite_model.py
 36. +22 -7     setup.py

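Note on the pattern: across the files listed above, scattered os.getenv() calls with ad-hoc parsing are replaced by attributes on the new aphrodite.envs module, so each variable is declared, documented, and parsed in exactly one place. This also retires a footgun visible in the old code below (e.g. in ray_gpu_executor.py), where bool(os.getenv(NAME, 0)) treats any non-empty string, including "0", as True. A minimal sketch of the before/after usage, with a variable defined in this commit:

    # Before: inline lookup, easy to parse inconsistently.
    import os
    use_spmd = bool(os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))  # "0" -> True!

    # After: single definition in aphrodite/envs.py, parsed as bool(int(...)).
    from aphrodite import envs
    use_spmd = envs.APHRODITE_USE_RAY_SPMD_WORKER  # "0" -> False
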
+ 3 - 7
aphrodite/assets/base.py

@@ -5,6 +5,7 @@ from functools import lru_cache
 from pathlib import Path
 from typing import Optional
 
+from aphrodite import envs
 from aphrodite.connections import global_http_connection
 
 
@@ -15,13 +16,8 @@ def get_default_cache_root():
     )
 
 vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
-APHRODITE_ASSETS_CACHE = os.path.expanduser(
-    os.getenv(
-        "APHRODITE_ASSETS_CACHE",
-        os.path.join(get_default_cache_root(), "aphrodite", "assets"),
-    ))
-APHRODITE_IMAGE_FETCH_TIMEOUT = int(os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT",
-                                              5))
+APHRODITE_ASSETS_CACHE = envs.APHRODITE_ASSETS_CACHE
+APHRODITE_IMAGE_FETCH_TIMEOUT = envs.APHRODITE_IMAGE_FETCH_TIMEOUT
 
 
 def get_cache_dir() -> Path:
     """Get the path to the cache for storing downloaded assets."""

+ 2 - 4
aphrodite/attention/backends/rocm_flash_attn.py

@@ -1,11 +1,11 @@
 """Attention layer ROCm GPUs."""
 """Attention layer ROCm GPUs."""
-import os
 from dataclasses import dataclass
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
 from typing import Any, Dict, List, Optional, Tuple, Type
 
 
 import torch
 import torch
 from loguru import logger
 from loguru import logger
 
 
+from aphrodite import envs
 from aphrodite.attention.backends.abstract import (AttentionBackend,
 from aphrodite.attention.backends.abstract import (AttentionBackend,
                                                    AttentionImpl,
                                                    AttentionImpl,
                                                    AttentionMetadata,
                                                    AttentionMetadata,
@@ -280,9 +280,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
 
 
         self.use_naive_attn = False
         self.use_naive_attn = False
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
-        self.use_triton_flash_attn = (os.environ.get(
-            "APHRODITE_USE_TRITON_FLASH_ATTN", "True").lower()
-                                      in ("true", "1"))
+        self.use_triton_flash_attn = envs.APHRODITE_USE_TRITON_FLASH_ATTN
         if self.use_triton_flash_attn:
         if self.use_triton_flash_attn:
             from aphrodite.attention.ops.triton_flash_attn import (  # noqa: F401
             from aphrodite.attention.ops.triton_flash_attn import (  # noqa: F401
                 triton_attention)
                 triton_attention)

+ 2 - 1
aphrodite/attention/selector.py

@@ -7,12 +7,13 @@ from typing import Generator, Optional, Type
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.attention.backends.abstract import AttentionBackend
 from aphrodite.common.utils import (STR_BACKEND_ENV_VAR, is_cpu, is_hip,
                                     is_openvino, is_xpu)
 from aphrodite.platforms import current_platform
 
-APHRODITE_ATTENTION_BACKEND = os.getenv("APHRODITE_ATTENTION_BACKEND", None)
+APHRODITE_ATTENTION_BACKEND = envs.APHRODITE_ATTENTION_BACKEND
 
 
 class _Backend(enum.Enum):

+ 34 - 16
aphrodite/common/config.py

@@ -9,6 +9,7 @@ import torch
 from loguru import logger
 from transformers import PretrainedConfig
 
+from aphrodite import envs
 from aphrodite.common.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
                                     cuda_device_count_stateless,
                                     get_cpu_memory, is_cpu, is_hip, is_neuron,
@@ -30,8 +31,7 @@ if TYPE_CHECKING:
         BaseTokenizerGroup)
 
 # If true, will load models from ModelScope instead of Hugging Face Hub.
-APHRODITE_USE_MODELSCOPE = os.environ.get("APHRODITE_USE_MODELSCOPE",
-                                          "False").lower() == "true"
+APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE
 
 _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 
@@ -1820,21 +1820,39 @@ def _get_and_verify_max_len(
                     "original_max_position_embeddings"]
             derived_max_model_len *= scaling_factor
 
+    # If the user specified a max length, make sure it is smaller than the
+    # derived length from the HF model config.
     if max_model_len is None:
-        max_model_len = derived_max_model_len
-    elif max_model_len > derived_max_model_len and rope_scaling_arg is None:
-        raise ValueError(
-            f"User-specified max_model_len {max_model_len} is higher than "
-            f"the original {derived_max_model_len}. "
-            "Please provide a rope_scaling dict to scale the model.")
-    elif max_model_len > derived_max_model_len and rope_scaling_arg is not None:
-        # hope this works
-        logger.warning(
-            f"User-specified max_model_len {max_model_len} is higher than "
-            f"the original {derived_max_model_len}. "
-            "Attempting to use RoPE scaling with the provided rope_scaling "
-            "dict.")
-        derived_max_model_len = max_model_len
+        max_model_len = int(derived_max_model_len)
+    elif max_model_len > derived_max_model_len:
+        # Some models might have a separate key for specifying model_max_length
+        # that will be bigger than derived_max_model_len. We compare user input
+        # with model_max_length and allow this override when it's smaller.
+        model_max_length = getattr(hf_config, "model_max_length", None)
+        if envs.APHRODITE_DYNAMIC_ROPE_SCALING:
+            scaling_factor = max_model_len / derived_max_model_len
+            hf_config.rope_scaling = {"factor": scaling_factor,
+                                      "type": "dynamic"}
+            logger.info(
+                "Using dynamic RoPE scaling to extend the model's max context "
+                f"length from {derived_max_model_len} to {max_model_len}.")
+            derived_max_model_len = max_model_len
+        elif model_max_length is not None and max_model_len <= model_max_length:
+            if disable_sliding_window:
+                # TODO: Find a model that has model_max_length
+                # with sliding window to see if this case should be allowed.
+                raise NotImplementedError(
+                    "Disabling sliding window is not supported for models "
+                    "with model_max_length in the config. Please raise an "
+                    "issue so we can investigate.")
+        else:
+            raise ValueError(
+                f"User-specified max_model_len ({max_model_len}) is greater "
+                f"than the derived max_model_len ({max_len_key}="
+                f"{derived_max_model_len} or model_max_length="
+                f"{model_max_length} in model's config.json). To allow "
+                "greater lengths, please set the env var "
+                "APHRODITE_DYNAMIC_ROPE_SCALING=1")
 
     return int(max_model_len)

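The new envs.APHRODITE_DYNAMIC_ROPE_SCALING branch above replaces the old rope_scaling_arg path: a context length above the value derived from config.json is now requested via the env var instead of a rope_scaling dict. A sketch of the intended use (model name illustrative; the LLM entry point is assumed to mirror the usual aphrodite API):

    import os

    # Opt in before the engine evaluates its env vars.
    os.environ["APHRODITE_DYNAMIC_ROPE_SCALING"] = "1"

    from aphrodite import LLM

    # With a derived max length of 4096, requesting 8192 sets
    # hf_config.rope_scaling = {"type": "dynamic", "factor": 2.0}
    # instead of raising ValueError.
    llm = LLM(model="meta-llama/Llama-2-7b-hf", max_model_len=8192)
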
+ 3 - 2
aphrodite/common/logger.py

@@ -15,11 +15,12 @@ from rich.markup import escape
 from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            TaskProgressColumn, TextColumn, TimeRemainingColumn)
 
+from aphrodite import envs
+
 RICH_CONSOLE = Console()
 LOG_LEVEL = os.getenv("APHRODITE_LOG_LEVEL", "INFO").upper()
 
-APHRODITE_CONFIGURE_LOGGING = int(os.getenv("APHRODITE_CONFIGURE_LOGGING",
-                                            "1"))
+APHRODITE_CONFIGURE_LOGGING = envs.APHRODITE_CONFIGURE_LOGGING
 
 
 def unwrap(wrapped, default=None):

+ 3 - 3
aphrodite/common/sampling_params.py

@@ -1,6 +1,5 @@
 """Sampling parameters for text generation."""
 """Sampling parameters for text generation."""
 import copy
 import copy
-import os
 from enum import IntEnum
 from enum import IntEnum
 from functools import cached_property
 from functools import cached_property
 from typing import Any, Callable, Dict, List, Optional, Set, Union
 from typing import Any, Callable, Dict, List, Optional, Set, Union
@@ -10,11 +9,12 @@ import torch
 from loguru import logger
 from loguru import logger
 from typing_extensions import Annotated
 from typing_extensions import Annotated
 
 
+from aphrodite import envs
+
 _SAMPLING_EPS = 1e-5
 _SAMPLING_EPS = 1e-5
 _MAX_TEMP = 1e-2
 _MAX_TEMP = 1e-2
 
 
-APHRODITE_NO_DEPRECATION_WARNING = bool(
-    int(os.environ.get("APHRODITE_NO_DEPRECATION_WARNING", "0")))
+APHRODITE_NO_DEPRECATION_WARNING = envs.APHRODITE_NO_DEPRECATION_WARNING
 
 
 
 
 class SamplingType(IntEnum):
 class SamplingType(IntEnum):

+ 8 - 11
aphrodite/common/utils.py

@@ -31,6 +31,7 @@ from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            SpinnerColumn, TextColumn, TimeElapsedColumn)
 from typing_extensions import ParamSpec, TypeIs, assert_never
 
+from aphrodite import envs
 from aphrodite.common.logger import enable_trace_function_call
 from aphrodite.distributed import get_tensor_model_parallel_rank
 
@@ -382,8 +383,7 @@ def get_aphrodite_instance_id():
     Instance id represents an instance of the Aphrodite. All processes in the
     same instance should have the same instance id.
     """
-    return os.environ.get("APHRODITE_INSTANCE_ID",
-                          f"aphrodite-instance-{random_uuid()}")
+    return envs.APHRODITE_INSTANCE_ID or f"aphrodite-instance-{random_uuid()}"
 
 
 @lru_cache(maxsize=None)
@@ -520,9 +520,7 @@ def get_distributed_init_method(ip: str, port: int) -> str:
 
 def get_open_zmq_ipc_path() -> str:
     if not in_windows():
-        APHRODITE_RPC_BASE_PATH = os.getenv("APHRODITE_RPC_BASE_PATH",
-                                        tempfile.gettempdir())
-        base_rpc_path = APHRODITE_RPC_BASE_PATH
+        base_rpc_path = envs.APHRODITE_RPC_BASE_PATH
         return f"ipc://{base_rpc_path}/{uuid4()}"
     else:
         # windows doesn't support ipc://
@@ -530,8 +528,7 @@ def get_open_zmq_ipc_path() -> str:
         return f"tcp://127.0.0.1:{get_open_port()}"
 
 def get_open_port(port: Optional[int] = None) -> int:
-    port = int(os.getenv("APHRODITE_PORT", 0)
-                ) if "APHRODITE_PORT" in os.environ else None
+    port = envs.APHRODITE_PORT
     if port is not None:
         while True:
             try:
@@ -948,7 +945,7 @@ def find_library(lib_name: str) -> str:
     # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
     locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
     # `LD_LIBRARY_PATH` searches the library in the user-defined paths
-    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    env_ld_library_path = envs.LD_LIBRARY_PATH
     if not locs and env_ld_library_path:
         locs = [
             os.path.join(dir, lib_name)
@@ -967,7 +964,7 @@ def find_nccl_library() -> str:
     After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
     found by `ctypes` automatically.
     """
-    so_file = os.environ.get("APHRODITE_NCCL_SO_PATH", "")
+    so_file = envs.APHRODITE_NCCL_SO_PATH
 
     # manually load the nccl library
     if so_file:
@@ -985,7 +982,7 @@ def find_nccl_library() -> str:
 
 
 def enable_trace_function_call_for_thread() -> None:
-    if int(os.getenv("APHRODITE_TRACE_FUNCTION", "0")):
+    if envs.APHRODITE_TRACE_FUNCTION:
         tmp_dir = tempfile.gettempdir()
         filename = (f"APHRODITE_TRACE_FUNCTION_for_process_{os.getpid()}"
                     f"_thread_{threading.get_ident()}_"
@@ -1074,7 +1071,7 @@ def cuda_device_count_stateless() -> int:
     # This can be removed and simply replaced with torch.cuda.get_device_count
     # after https://github.com/pytorch/pytorch/pull/122815 is released.
 
-    return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES"))
+    return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
 #From: https://stackoverflow.com/a/4104188/2749989

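The get_open_port() change above keeps the old semantics through the envs module: envs.APHRODITE_PORT is None when the variable is unset, so the OS still picks a free ephemeral port by default. A small sketch of the expected behavior:

    import os

    os.environ["APHRODITE_PORT"] = "2242"

    from aphrodite import envs
    from aphrodite.common.utils import get_open_port

    # envs.APHRODITE_PORT is parsed lazily on attribute access, so the
    # assignment above is visible here: 2242 (int), or None when unset.
    print(envs.APHRODITE_PORT)
    print(get_open_port())  # starts probing from 2242
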
+ 2 - 2
aphrodite/distributed/device_communicators/custom_all_reduce.py

@@ -1,4 +1,3 @@
-import os
 from contextlib import contextmanager
 from typing import Any, List, Optional, Union
 
@@ -8,6 +7,7 @@ from loguru import logger
 from torch.distributed import ProcessGroup
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.utils import cuda_device_count_stateless
 from aphrodite.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
@@ -95,7 +95,7 @@ class CustomAllreduce:
         assert isinstance(device, torch.device)
         self.device = device
 
-        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:

+ 7 - 6
aphrodite/distributed/device_communicators/custom_all_reduce_utils.py

@@ -11,6 +11,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.utils import (cuda_device_count_stateless,
                                     update_environment_variables)
 from aphrodite.distributed.device_communicators.cuda_wrapper import (
@@ -124,7 +125,7 @@ def can_actually_p2p(
     processes for testing all pairs of GPUs in batch. The trick is to reset
     the device after each test (which is not available in PyTorch).
     """  # noqa
-    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     # pass the CUDA_VISIBLE_DEVICES to the child process
     # to make sure they see the same set of GPUs
 
@@ -183,13 +184,13 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     is_distributed = dist.is_initialized()
 
     num_dev = cuda_device_count_stateless()
-    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
-    APHRODITE_CONFIG_ROOT = os.getenv("APHRODITE_CONFIG_ROOT", "~/.config")
-    path = os.path.expanduser(
-        f"{APHRODITE_CONFIG_ROOT}/aphrodite/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
-    )
+
+    path = os.path.join(
+        envs.APHRODITE_CACHE_ROOT,
+        f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
     os.makedirs(os.path.dirname(path), exist_ok=True)
     from aphrodite.distributed.parallel_state import get_world_group
     if ((not is_distributed or get_world_group().local_rank == 0)

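Behavioral note: the P2P access cache file moves from the config root to the new envs.APHRODITE_CACHE_ROOT (default ~/.cache/aphrodite, honoring XDG_CACHE_HOME). A sketch of where the cache now lands:

    import os
    from aphrodite import envs

    # e.g. /home/user/.cache/aphrodite/gpu_p2p_access_cache_for_0,1.json
    path = os.path.join(envs.APHRODITE_CACHE_ROOT,
                        "gpu_p2p_access_cache_for_0,1.json")
    print(path)
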
+ 3 - 3
aphrodite/distributed/device_communicators/shm_broadcast.py

@@ -1,4 +1,3 @@
-import os
 import pickle
 import time
 from contextlib import contextmanager
@@ -13,10 +12,11 @@ from loguru import logger
 from torch.distributed import ProcessGroup
 from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
 
+from aphrodite import envs
 from aphrodite.common.utils import get_ip, get_open_port
 
-APHRODITE_RINGBUFFER_WARNING_INTERVAL = os.getenv(
-    "APHRODITE_RINGBUFFER_WARNING_INTERVAL", 60)
+APHRODITE_RINGBUFFER_WARNING_INTERVAL = (
+    envs.APHRODITE_RINGBUFFER_WARNING_INTERVAL)
 
 # time to wait if the queue is full or empty
 # if we sleep for too short, it will consume too much CPU

+ 3 - 2
aphrodite/distributed/parallel_state.py

@@ -21,7 +21,6 @@ If you only need to use the distributed environment without model/pipeline
  steps.
 """
 import contextlib
-import os
 import pickle
 import sys
 from collections import namedtuple
@@ -36,6 +35,8 @@ import torch.distributed
 from loguru import logger
 from torch.distributed import Backend, ProcessGroup
 
+from aphrodite import envs
+
 
 @dataclass
 class GraphCaptureContext:
@@ -866,7 +867,7 @@ def init_distributed_environment(
         # local rank not set, this usually happens in single-node
         # setting, where we can use rank as local rank
         if distributed_init_method == "env://":
-            local_rank = os.getenv("LOCAL_RANK", rank)
+            local_rank = envs.LOCAL_RANK
         else:
             local_rank = rank
     global _WORLD

+ 3 - 2
aphrodite/distributed/utils.py

@@ -3,12 +3,13 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-import os
 from typing import Sequence, Tuple
 
 import torch
 
-APHRODITE_PP_LAYER_PARTITION = os.getenv("APHRODITE_PP_LAYER_PARTITION", None)
+from aphrodite import envs
+
+APHRODITE_PP_LAYER_PARTITION = envs.APHRODITE_PP_LAYER_PARTITION
 
 
 def ensure_divisibility(numerator, denominator):

+ 2 - 1
aphrodite/endpoints/openai/api_server.py

@@ -20,6 +20,7 @@ from fastapi.responses import (HTMLResponse, JSONResponse, Response,
 from loguru import logger
 from starlette.routing import Mount
 
+from aphrodite import envs
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sampling_params import _SAMPLING_EPS, SamplingParams
@@ -635,7 +636,7 @@ def build_app(args: Namespace) -> FastAPI:
         return JSONResponse(err.model_dump(),
                             status_code=HTTPStatus.BAD_REQUEST)
 
-    if token := os.environ.get("APHRODITE_API_KEY") or args.api_keys:
+    if token := envs.APHRODITE_API_KEY or args.api_keys:
         admin_key = os.environ.get("APHRODITE_ADMIN_KEY") or args.admin_key
 
         if admin_key is None:

+ 2 - 3
aphrodite/engine/aphrodite_engine.py

@@ -1,4 +1,3 @@
-import os
 import time
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, ClassVar, Dict, Iterable, List, Optional
@@ -9,6 +8,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DecodingConfig, DeviceConfig,
                                      EngineConfig, LoadConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,
@@ -50,8 +50,7 @@ from aphrodite.version import __version__ as APHRODITE_VERSION
 
 _LOCAL_LOGGING_INTERVAL_SEC = 5
 
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
 
 
 def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:

+ 2 - 3
aphrodite/engine/args_tools.py

@@ -1,13 +1,13 @@
 import argparse
 import dataclasses
 import json
-import os
 from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,
                     Union)
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, ConfigFormat, DecodingConfig,
                                      DeviceConfig, EngineConfig, LoadConfig,
                                      LoadFormat, LoRAConfig, ModelConfig,
@@ -24,8 +24,7 @@ if TYPE_CHECKING:
     from aphrodite.transformers_utils.tokenizer_group import BaseTokenizerGroup
 
 
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
 
 def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
     if len(val) == 0:

+ 2 - 3
aphrodite/engine/async_aphrodite.py

@@ -1,5 +1,4 @@
 import asyncio
-import os
 import time
 from dataclasses import dataclass
 from functools import partial
@@ -11,6 +10,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
+from aphrodite import envs
 from aphrodite.common.config import (DecodingConfig, EngineConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)
@@ -34,8 +34,7 @@ from aphrodite.lora.request import LoRARequest
 from aphrodite.processing.scheduler import SchedulerOutputs
 from aphrodite.prompt_adapter.request import PromptAdapterRequest
 
-ENGINE_ITERATION_TIMEOUT_S = int(
-    os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", "60"))
+ENGINE_ITERATION_TIMEOUT_S = envs.APHRODITE_ENGINE_ITERATION_TIMEOUT_S
 
 
 class AsyncEngineDeadError(RuntimeError):

+ 388 - 0
aphrodite/envs.py

@@ -0,0 +1,388 @@
+import os
+import tempfile
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+
+if TYPE_CHECKING:
+    APHRODITE_HOST_IP: str = ""
+    APHRODITE_PORT: Optional[int] = None
+    APHRODITE_RPC_BASE_PATH: str = tempfile.gettempdir()
+    APHRODITE_USE_MODELSCOPE: bool = False
+    APHRODITE_RINGBUFFER_WARNING_INTERVAL: int = 60
+    APHRODITE_INSTANCE_ID: Optional[str] = None
+    APHRODITE_NCCL_SO_PATH: Optional[str] = None
+    LD_LIBRARY_PATH: Optional[str] = None
+    APHRODITE_USE_TRITON_FLASH_ATTN: bool = False
+    LOCAL_RANK: int = 0
+    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    APHRODITE_ENGINE_ITERATION_TIMEOUT_S: int = 60
+    APHRODITE_API_KEY: Optional[str] = None
+    S3_ACCESS_KEY_ID: Optional[str] = None
+    S3_SECRET_ACCESS_KEY: Optional[str] = None
+    S3_ENDPOINT_URL: Optional[str] = None
+    APHRODITE_CACHE_ROOT: str = os.path.expanduser("~/.cache/aphrodite")
+    APHRODITE_CONFIG_ROOT: str = os.path.expanduser("~/.config/aphrodite")
+    APHRODITE_CONFIGURE_LOGGING: int = 1
+    APHRODITE_LOGGING_LEVEL: str = "INFO"
+    APHRODITE_LOGGING_CONFIG_PATH: Optional[str] = None
+    APHRODITE_TRACE_FUNCTION: int = 0
+    APHRODITE_ATTENTION_BACKEND: Optional[str] = None
+    APHRODITE_USE_SAMPLING_KERNELS: bool = False
+    APHRODITE_PP_LAYER_PARTITION: Optional[str] = None
+    APHRODITE_CPU_KVCACHE_SPACE: int = 0
+    APHRODITE_CPU_OMP_THREADS_BIND: str = ""
+    APHRODITE_OPENVINO_KVCACHE_SPACE: int = 0
+    APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
+    APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
+    APHRODITE_XLA_CACHE_PATH: str = os.path.join(APHRODITE_CACHE_ROOT, "xla_cache")  # noqa: E501
+    APHRODITE_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
+    APHRODITE_USE_RAY_SPMD_WORKER: bool = False
+    APHRODITE_USE_RAY_COMPILED_DAG: bool = False
+    APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
+    APHRODITE_WORKER_MULTIPROC_METHOD: str = "fork"
+    APHRODITE_ASSETS_CACHE: str = os.path.join(APHRODITE_CACHE_ROOT, "assets")
+    APHRODITE_IMAGE_FETCH_TIMEOUT: int = 5
+    APHRODITE_AUDIO_FETCH_TIMEOUT: int = 5
+    APHRODITE_TARGET_DEVICE: str = "cuda"
+    MAX_JOBS: Optional[str] = None
+    NVCC_THREADS: Optional[str] = None
+    APHRODITE_USE_PRECOMPILED: bool = False
+    APHRODITE_NO_DEPRECATION_WARNING: bool = False
+    APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
+    CMAKE_BUILD_TYPE: Optional[str] = None
+    VERBOSE: bool = False
+    APHRODITE_DYNAMIC_ROPE_SCALING: bool = False
+    APHRODITE_TEST_FORCE_FP8_MARLIN: bool = False
+    APHRODITE_ALLOW_ENGINE_USE_RAY: bool = False
+    APHRODITE_PLUGINS: Optional[List[str]] = None
+
+
+def get_default_cache_root():
+    return os.getenv(
+        "XDG_CACHE_HOME",
+        os.path.join(os.path.expanduser("~"), ".cache"),
+    )
+
+
+def get_default_config_root():
+    return os.getenv(
+        "XDG_CONFIG_HOME",
+        os.path.join(os.path.expanduser("~"), ".config"),
+    )
+
+
+# The begin-* and end-* here are used by the documentation generator
+# to extract the used env vars.
+
+# begin-env-vars-definition
+
+environment_variables: Dict[str, Callable[[], Any]] = {
+
+    # ================== Installation Time Env Vars ==================
+
+    # Target device of Aphrodite, supporting [cuda (by default),
+    # rocm, neuron, cpu, openvino]
+    "APHRODITE_TARGET_DEVICE":
+    lambda: os.getenv("APHRODITE_TARGET_DEVICE", "cuda"),
+
+    # Maximum number of compilation jobs to run in parallel.
+    # By default this is the number of CPUs
+    "MAX_JOBS":
+    lambda: os.getenv("MAX_JOBS", None),
+
+    # Number of threads to use for nvcc
+    # By default this is 1.
+    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
+    "NVCC_THREADS":
+    lambda: os.getenv("NVCC_THREADS", None),
+
+    # If set, Aphrodite will use precompiled binaries (*.so)
+    "APHRODITE_USE_PRECOMPILED":
+    lambda: bool(os.environ.get("APHRODITE_USE_PRECOMPILED")),
+
+    # CMake build type
+    # If not set, defaults to "Debug" or "RelWithDebInfo"
+    # Available options: "Debug", "Release", "RelWithDebInfo"
+    "CMAKE_BUILD_TYPE":
+    lambda: os.getenv("CMAKE_BUILD_TYPE"),
+
+    # If set, Aphrodite will print verbose logs during installation
+    "VERBOSE":
+    lambda: bool(int(os.getenv('VERBOSE', '0'))),
+
+    # Root directory for APHRODITE configuration files
+    # Defaults to `~/.config/aphrodite` unless `XDG_CONFIG_HOME` is set
+    # Note that this not only affects how aphrodite finds its configuration
+    # files during runtime, but also affects how aphrodite installs its
+    # configuration files during **installation**.
+    "APHRODITE_CONFIG_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_CONFIG_ROOT",
+            os.path.join(get_default_config_root(), "aphrodite"),
+        )),
+
+    # ================== Runtime Env Vars ==================
+
+    # Root directory for APHRODITE cache files
+    # Defaults to `~/.cache/aphrodite` unless `XDG_CACHE_HOME` is set
+    "APHRODITE_CACHE_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_CACHE_ROOT",
+            os.path.join(get_default_cache_root(), "aphrodite"),
+        )),
+
+    # used in distributed environment to determine the master address
+    'APHRODITE_HOST_IP':
+    lambda: os.getenv('APHRODITE_HOST_IP', "") or os.getenv("HOST_IP", ""),
+
+    # used in distributed environment to manually set the communication port
+    # Note: if APHRODITE_PORT is set, and some code asks for multiple ports, the
+    # APHRODITE_PORT will be used as the first port, and the rest will be
+    # generated by incrementing the APHRODITE_PORT value.
+    # '0' is used to make mypy happy
+    'APHRODITE_PORT':
+    lambda: int(os.getenv('APHRODITE_PORT', '0'))
+    if 'APHRODITE_PORT' in os.environ else None,
+
+    # path used for ipc when the frontend api server is running in
+    # multi-processing mode to communicate with the backend engine process.
+    'APHRODITE_RPC_BASE_PATH':
+    lambda: os.getenv('APHRODITE_RPC_BASE_PATH', tempfile.gettempdir()),
+
+    # If true, will load models from ModelScope instead of Hugging Face Hub.
+    # note that the value is true or false, not numbers
+    "APHRODITE_USE_MODELSCOPE":
+    lambda: os.environ.get(
+        "APHRODITE_USE_MODELSCOPE", "False").lower() == "true",
+
+    # Instance id represents an instance of the APHRODITE. All processes in the
+    # same instance should have the same instance id.
+    "APHRODITE_INSTANCE_ID":
+    lambda: os.environ.get("APHRODITE_INSTANCE_ID", None),
+
+    # Interval in seconds to log a warning message when the ring buffer is full
+    "APHRODITE_RINGBUFFER_WARNING_INTERVAL":
+    lambda: int(os.environ.get("APHRODITE_RINGBUFFER_WARNING_INTERVAL", "60")),
+
+    # path to cudatoolkit home directory, under which should be bin, include,
+    # and lib directories.
+    "CUDA_HOME":
+    lambda: os.environ.get("CUDA_HOME", None),
+
+    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
+    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
+    "APHRODITE_NCCL_SO_PATH":
+    lambda: os.environ.get("APHRODITE_NCCL_SO_PATH", None),
+
+    # when `APHRODITE_NCCL_SO_PATH` is not set, aphrodite will try to find the
+    # nccl library file in the locations specified by `LD_LIBRARY_PATH`
+    "LD_LIBRARY_PATH":
+    lambda: os.environ.get("LD_LIBRARY_PATH", None),
+
+    # flag to control if aphrodite should use triton flash attention
+    "APHRODITE_USE_TRITON_FLASH_ATTN":
+    lambda: (os.environ.get(
+        "APHRODITE_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")),
+
+    # Internal flag to enable Dynamo graph capture
+    "APHRODITE_TEST_DYNAMO_GRAPH_CAPTURE":
+    lambda: int(os.environ.get("APHRODITE_TEST_DYNAMO_GRAPH_CAPTURE", "0")),
+
+    # local rank of the process in the distributed setting, used to determine
+    # the GPU device id
+    "LOCAL_RANK":
+    lambda: int(os.environ.get("LOCAL_RANK", "0")),
+
+    # used to control the visible devices in the distributed setting
+    "CUDA_VISIBLE_DEVICES":
+    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
+
+    # timeout for each iteration in the engine
+    "APHRODITE_ENGINE_ITERATION_TIMEOUT_S":
+    lambda: int(os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", "60")),
+
+    # API key for APHRODITE API server
+    "APHRODITE_API_KEY":
+    lambda: os.environ.get("APHRODITE_API_KEY", None),
+
+    # S3 access information, used for tensorizer to load model from S3
+    "S3_ACCESS_KEY_ID":
+    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
+    "S3_SECRET_ACCESS_KEY":
+    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+    "S3_ENDPOINT_URL":
+    lambda: os.environ.get("S3_ENDPOINT_URL", None),
+
+    # Logging configuration
+    # If set to 0, aphrodite will not configure logging
+    # If set to 1, aphrodite will configure logging using the default
+    # configuration or the configuration file specified by
+    # APHRODITE_LOGGING_CONFIG_PATH
+    "APHRODITE_CONFIGURE_LOGGING":
+    lambda: int(os.getenv("APHRODITE_CONFIGURE_LOGGING", "1")),
+    "APHRODITE_LOGGING_CONFIG_PATH":
+    lambda: os.getenv("APHRODITE_LOGGING_CONFIG_PATH"),
+
+    # this is used for configuring the default logging level
+    "APHRODITE_LOGGING_LEVEL":
+    lambda: os.getenv("APHRODITE_LOGGING_LEVEL", "INFO"),
+
+    # Trace function calls
+    # If set to 1, aphrodite will trace function calls
+    # Useful for debugging
+    "APHRODITE_TRACE_FUNCTION":
+    lambda: int(os.getenv("APHRODITE_TRACE_FUNCTION", "0")),
+
+    # Backend for attention computation
+    # Available options:
+    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+    # - "FLASH_ATTN": use FlashAttention
+    # - "XFORMERS": use XFormers
+    # - "ROCM_FLASH": use ROCmFlashAttention
+    # - "FLASHINFER": use flashinfer
+    "APHRODITE_ATTENTION_BACKEND":
+    lambda: os.getenv("APHRODITE_ATTENTION_BACKEND", None),
+
+    # If set, aphrodite will use flashinfer sampler
+    "APHRODITE_USE_SAMPLING_KERNELS":
+    lambda: bool(int(os.getenv("APHRODITE_USE_SAMPLING_KERNELS", "0"))),
+
+    # Pipeline stage partition strategy
+    "APHRODITE_PP_LAYER_PARTITION":
+    lambda: os.getenv("APHRODITE_PP_LAYER_PARTITION", None),
+
+    # (CPU backend only) CPU key-value cache space.
+    # default is 4GB
+    "APHRODITE_CPU_KVCACHE_SPACE":
+    lambda: int(os.getenv("APHRODITE_CPU_KVCACHE_SPACE", "0")),
+
+    # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
+    # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
+    "APHRODITE_CPU_OMP_THREADS_BIND":
+    lambda: os.getenv("APHRODITE_CPU_OMP_THREADS_BIND", "all"),
+
+    # OpenVINO key-value cache space
+    # default is 4GB
+    "APHRODITE_OPENVINO_KVCACHE_SPACE":
+    lambda: int(os.getenv("APHRODITE_OPENVINO_KVCACHE_SPACE", "0")),
+
+    # OpenVINO KV cache precision
+    # default is bf16 if natively supported by platform, otherwise f16
+    # To enable KV cache compression, please, explicitly specify u8
+    "APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION":
+    lambda: os.getenv("APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION", None),
+
+    # Enables weights compression during model export via HF Optimum
+    # default is False
+    "APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
+    lambda: bool(os.getenv(
+        "APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
+
+    # If the env var is set, then all workers will execute as separate
+    # processes from the engine, and we use the same mechanism to trigger
+    # execution on all workers.
+    # Run aphrodite with APHRODITE_USE_RAY_SPMD_WORKER=1 to enable it.
+    "APHRODITE_USE_RAY_SPMD_WORKER":
+    lambda: bool(int(os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", "0"))),
+
+    # If the env var is set, it uses the Ray's compiled DAG API
+    # which optimizes the control plane overhead.
+    # Run aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
+    "APHRODITE_USE_RAY_COMPILED_DAG":
+    lambda: bool(int(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", "0"))),
+
+    # If the env var is set, it uses NCCL for communication in
+    # Ray's compiled DAG. This flag is ignored if
+    # APHRODITE_USE_RAY_COMPILED_DAG is not set.
+    "APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
+    lambda: bool(int(
+        os.getenv("APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))),
+
+    # Use dedicated multiprocess context for workers.
+    # Both spawn and fork work
+    "APHRODITE_WORKER_MULTIPROC_METHOD":
+    lambda: os.getenv("APHRODITE_WORKER_MULTIPROC_METHOD", "fork"),
+
+    # Path to the cache for storing downloaded assets
+    "APHRODITE_ASSETS_CACHE":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_ASSETS_CACHE",
+            os.path.join(get_default_cache_root(), "aphrodite", "assets"),
+        )),
+
+    # Timeout for fetching images when serving multimodal models
+    # Default is 5 seconds
+    "APHRODITE_IMAGE_FETCH_TIMEOUT":
+    lambda: int(os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT", "5")),
+
+    # Timeout for fetching audio when serving multimodal models
+    # Default is 5 seconds
+    "APHRODITE_AUDIO_FETCH_TIMEOUT":
+    lambda: int(os.getenv("APHRODITE_AUDIO_FETCH_TIMEOUT", "5")),
+
+    # Path to the XLA persistent cache directory.
+    # Only used for XLA devices such as TPUs.
+    "APHRODITE_XLA_CACHE_PATH":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_XLA_CACHE_PATH",
+            os.path.join(get_default_cache_root(), "aphrodite", "xla_cache"),
+        )),
+    "APHRODITE_FUSED_MOE_CHUNK_SIZE":
+    lambda: int(os.getenv("APHRODITE_FUSED_MOE_CHUNK_SIZE", "65536")),
+
+    # If set, aphrodite will skip the deprecation warnings.
+    "APHRODITE_NO_DEPRECATION_WARNING":
+    lambda: bool(int(os.getenv("APHRODITE_NO_DEPRECATION_WARNING", "0"))),
+
+    # If set, the OpenAI API server will stay alive even after the underlying
+    # AsyncLLMEngine errors and stops serving requests
+    "APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH":
+    lambda: bool(os.getenv("APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
+
+    # If the env var APHRODITE_DYNAMIC_ROPE_SCALING is set, it allows
+    # the user to specify a max sequence length greater than
+    # the max length derived from the model's config.json.
+    # To enable this, set APHRODITE_DYNAMIC_ROPE_SCALING=1.
+    "APHRODITE_DYNAMIC_ROPE_SCALING":
+    lambda:
+    (os.environ.get(
+        "APHRODITE_DYNAMIC_ROPE_SCALING",
+        "0").strip().lower() in ("1", "true")),
+
+    # If set, forces FP8 Marlin to be used for FP8 quantization regardless
+    # of the hardware support for FP8 compute.
+    "APHRODITE_TEST_FORCE_FP8_MARLIN":
+    lambda:
+    (os.environ.get("APHRODITE_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
+     ("1", "true")),
+
+    # If set, allow running the engine as a separate ray actor,
+    # which is a deprecated feature soon to be removed.
+    "APHRODITE_ALLOW_ENGINE_USE_RAY":
+    lambda:
+    (os.environ.get("APHRODITE_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
+     ("1", "true")),
+
+    # a list of plugin names to load, separated by commas.
+    # if this is not set, it means all plugins will be loaded
+    # if this is set to an empty string, no plugins will be loaded
+    "APHRODITE_PLUGINS":
+    lambda: None if "APHRODITE_PLUGINS" not in os.environ else os.environ[
+        "APHRODITE_PLUGINS"].split(","),
+}
+
+# end-env-vars-definition
+
+
+def __getattr__(name: str):
+    # lazy evaluation of environment variables
+    if name in environment_variables:
+        return environment_variables[name]()
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    return list(environment_variables.keys())

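The module relies on PEP 562's module-level __getattr__, so every attribute access re-reads and re-parses the process environment; nothing is cached at import time. A quick sketch of the observable behavior:

    import os

    # Set before (or even after) importing envs; it is read on access.
    os.environ["APHRODITE_IMAGE_FETCH_TIMEOUT"] = "30"

    from aphrodite import envs

    print(envs.APHRODITE_IMAGE_FETCH_TIMEOUT)   # 30 (int, not str)
    print(envs.APHRODITE_USE_MODELSCOPE)        # False unless set to "true"
    assert "APHRODITE_ATTENTION_BACKEND" in dir(envs)  # __dir__ lists all vars
    # Unknown names raise AttributeError instead of returning None.
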
+ 2 - 1
aphrodite/executor/cpu_executor.py

@@ -5,6 +5,7 @@ from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import CacheConfig, ModelConfig, SchedulerConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_aphrodite_instance_id,
@@ -333,7 +334,7 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
         logger.warning("Prefix caching is not supported on CPU, disable it.")
         config.enable_prefix_caching = False
 
-    kv_cache_space_str = os.getenv("APHRODITE_CPU_KVCACHE_SPACE", "0")
+    kv_cache_space_str = envs.APHRODITE_CPU_KVCACHE_SPACE
     kv_cache_space = int(kv_cache_space_str)
 
     if kv_cache_space >= 0:

+ 3 - 1
aphrodite/executor/multiproc_worker_utils.py

@@ -14,6 +14,8 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
 
 from loguru import logger
 
+from aphrodite import envs
+
 T = TypeVar('T')
 
 _TERMINATE = "TERMINATE"  # sentinel
@@ -26,7 +28,7 @@ JOIN_TIMEOUT_S = 2
 
 # Use dedicated multiprocess context for workers.
 # Both spawn and fork work
-mp_method = os.getenv("APHRODITE_WORKER_MULTIPROC_METHOD", "fork")
+mp_method = envs.APHRODITE_WORKER_MULTIPROC_METHOD
 mp = multiprocessing.get_context(mp_method)

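The default stays "fork", so behavior is unchanged; as a usage note, "spawn" is the value to set when CUDA has already been initialized in the parent process, since forking after CUDA init is unsafe. A sketch:

    import multiprocessing
    from aphrodite import envs

    # Same call as in the hunk above; APHRODITE_WORKER_MULTIPROC_METHOD=spawn
    # switches the worker context when fork is not viable (e.g. after CUDA init).
    mp = multiprocessing.get_context(envs.APHRODITE_WORKER_MULTIPROC_METHOD)
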
+ 4 - 5
aphrodite/executor/openvino_executor.py

@@ -1,4 +1,3 @@
-import os
 from typing import List, Set, Tuple
 
 import openvino as ov
@@ -6,6 +5,7 @@ import openvino.properties.hint as hints
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import CacheConfig, ModelConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_distributed_init_method,
@@ -13,10 +13,9 @@ from aphrodite.common.utils import (GiB_bytes, get_distributed_init_method,
 from aphrodite.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from aphrodite.lora.request import LoRARequest
 
-APHRODITE_OPENVINO_KVCACHE_SPACE = int(
-    os.getenv("APHRODITE_OPENVINO_KVCACHE_SPACE", 0))
-APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION = os.getenv(
-    "APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION", None)
+APHRODITE_OPENVINO_KVCACHE_SPACE = envs.APHRODITE_OPENVINO_KVCACHE_SPACE
+APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION = (
+    envs.APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION)
 
 
 class OpenVINOExecutor(ExecutorBase):

+ 7 - 8
aphrodite/executor/ray_gpu_executor.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 import msgspec
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (_run_task_with_lock,
                                     get_aphrodite_instance_id,
@@ -26,14 +27,12 @@ if TYPE_CHECKING:
 # If the env var is set, it uses the Ray's compiled DAG API
 # which optimizes the control plane overhead.
 # Run Aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
-APHRODITE_USE_RAY_COMPILED_DAG = bool(
-    os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", 0))
-APHRODITE_TRACE_FUNCTION = int(os.getenv("APHRODITE_TRACE_FUNCTION", 0))
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
-
-APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = bool(
-    int(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", 1)))
+APHRODITE_USE_RAY_COMPILED_DAG = envs.APHRODITE_USE_RAY_COMPILED_DAG
+APHRODITE_TRACE_FUNCTION = envs.APHRODITE_TRACE_FUNCTION
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
+
+APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = (
+    envs.APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL)
 
 
 class RayGPUExecutor(DistributedGPUExecutor):

+ 2 - 1
aphrodite/executor/ray_tpu_executor.py

@@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple,
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (get_aphrodite_instance_id,
                                     get_distributed_init_method, get_ip,
@@ -21,7 +22,7 @@ if ray is not None:
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
 
-APHRODITE_TRACE_FUNCTION = int(os.getenv("APHRODITE_TRACE_FUNCTION", 0))
+APHRODITE_TRACE_FUNCTION = envs.APHRODITE_TRACE_FUNCTION
 
 
 class RayTPUExecutor(TPUExecutor):

+ 2 - 1
aphrodite/executor/ray_xpu_executor.py

@@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
                                      PromptAdapterConfig, SchedulerConfig,
@@ -28,7 +29,7 @@ if TYPE_CHECKING:
 # If the env var is set, it uses the Ray's compiled DAG API
 # which optimizes the control plane overhead.
 # Run Aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
-USE_RAY_COMPILED_DAG = bool(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", 0))
+USE_RAY_COMPILED_DAG = envs.APHRODITE_USE_RAY_COMPILED_DAG
 
 
 class RayXPUExecutor(DistributedGPUExecutor):

+ 2 - 2
aphrodite/modeling/layers/fused_moe/fused_moe.py

@@ -10,10 +10,10 @@ import triton.language as tl
 from loguru import logger
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.platforms import current_platform
 
-APHRODITE_FUSED_MOE_CHUNK_SIZE = int(
-    os.getenv("APHRODITE_FUSED_MOE_CHUNK_SIZE", "65536"))
+APHRODITE_FUSED_MOE_CHUNK_SIZE = envs.APHRODITE_FUSED_MOE_CHUNK_SIZE
 
 
 @triton.jit

+ 2 - 3
aphrodite/modeling/layers/sampler.py

@@ -1,6 +1,5 @@
 """A layer that samples the next tokens from the model's outputs."""
 """A layer that samples the next tokens from the model's outputs."""
 import itertools
 import itertools
-import os
 import warnings
 import warnings
 from enum import IntEnum
 from enum import IntEnum
 from math import inf
 from math import inf
@@ -11,6 +10,7 @@ import torch.nn as nn
 from loguru import logger
 from loguru import logger
 
 
 import aphrodite._custom_ops as ops
 import aphrodite._custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.sampling_params import SamplingType
 from aphrodite.common.sampling_params import SamplingType
 from aphrodite.common.sequence import (CompletionSequenceGroupOutput, Logprob,
 from aphrodite.common.sequence import (CompletionSequenceGroupOutput, Logprob,
                                        PromptLogprobs, SampleLogprobs,
                                        PromptLogprobs, SampleLogprobs,
@@ -34,8 +34,7 @@ _TEMPERATURE_MINIMUM = 2e-5
 
 
 # If enabled, we switch to a more performant implementation
 # If enabled, we switch to a more performant implementation
 # of top-k and top-p
 # of top-k and top-p
-APHRODITE_USE_SAMPLING_KERNELS = bool(int(
-    os.getenv("APHRODITE_USE_SAMPLING_KERNELS", "0")))
+APHRODITE_USE_SAMPLING_KERNELS = envs.APHRODITE_USE_SAMPLING_KERNELS
 
 
 
 
 class SamplerID(IntEnum):
 class SamplerID(IntEnum):

+ 3 - 3
aphrodite/modeling/model_loader/openvino.py

@@ -1,5 +1,4 @@
 # ruff: noqa: SIM117
-import os
 from pathlib import Path
 from typing import List, Optional, Tuple

@@ -11,6 +10,7 @@ from openvino._offline_transformations import paged_attention_transformation
 from optimum.intel import OVModelForCausalLM
 from torch import nn

+from aphrodite import envs
 from aphrodite.attention.backends.openvino import OpenVINOAttentionMetadata
 from aphrodite.common.config import DeviceConfig, ModelConfig
 from aphrodite.common.sequence import SamplerOutput
@@ -19,8 +19,8 @@ from aphrodite.modeling.layers.logits_processor import (LogitsProcessor,
 from aphrodite.modeling.layers.sampler import Sampler
 from aphrodite.modeling.sampling_metadata import SamplingMetadata

-APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS = bool(
-    os.getenv("APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False))
+APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS = (
+    envs.APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS)


 def _flattenize_inputs(inputs):
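
Worth noting: the inline parsers being deleted did not agree on what counts as "true", which is part of the motivation for consolidating them. The old OpenVINO line above is the clearest case, since `bool()` of any non-empty string is `True`. A small self-contained demonstration; the variable name is taken from the diff, and the stricter parse mirrors the style of the old `fp8.py` line further below:

import os

os.environ["APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS"] = "0"

# Old style: bool() of a non-empty string is always True, so
# exporting "0" would still have enabled the feature.
old = bool(os.getenv("APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False))
assert old is True  # surprising

# Stricter style (as in the old fp8.py parser): only "1" or "true" enable it.
new = (os.getenv("APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0")
       .strip().lower() in ("1", "true"))
assert new is False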

+ 4 - 3
aphrodite/modeling/model_loader/tensorizer.py

@@ -13,6 +13,7 @@ from loguru import logger
 from torch import nn
 from transformers import PretrainedConfig

+from aphrodite import envs
 from aphrodite.common.config import ModelConfig, ParallelConfig
 from aphrodite.engine.aphrodite_engine import AphroditeEngine
 from aphrodite.engine.args_tools import EngineArgs
@@ -148,12 +149,12 @@ class TensorizerArgs:
     def __post_init__(self):
         self.file_obj = self.tensorizer_uri
         self.s3_access_key_id = (self.s3_access_key_id
-                                 or os.environ.get("S3_ACCESS_KEY_ID")) or None
+                                 or envs.S3_ACCESS_KEY_ID) or None
         self.s3_secret_access_key = (
             self.s3_secret_access_key
-            or os.environ.get("S3_SECRET_ACCESS_KEY")) or None
+            or envs.S3_SECRET_ACCESS_KEY) or None
         self.s3_endpoint = (self.s3_endpoint
-                            or os.environ.get("S3_ENDPOINT_URL")) or None
+                            or envs.S3_ENDPOINT_URL) or None
         self.stream_params = {
             "s3_access_key_id": self.s3_access_key_id,
             "s3_secret_access_key": self.s3_secret_access_key,

+ 3 - 5
aphrodite/multimodal/utils.py

@@ -1,5 +1,4 @@
 import base64
-import os
 from io import BytesIO
 from typing import Tuple, Union

@@ -8,14 +7,13 @@ import numpy as np
 import soundfile
 from PIL import Image

+from aphrodite import envs
 from aphrodite.common.connections import global_http_connection
 from aphrodite.multimodal.base import MultiModalDataDict

-APHRODITE_IMAGE_FETCH_TIMEOUT = int(
-    os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT", 10))
+APHRODITE_IMAGE_FETCH_TIMEOUT = envs.APHRODITE_IMAGE_FETCH_TIMEOUT

-APHRODITE_AUDIO_FETCH_TIMEOUT = int(
-    os.getenv("APHRODITE_AUDIO_FETCH_TIMEOUT", 10))
+APHRODITE_AUDIO_FETCH_TIMEOUT = envs.APHRODITE_AUDIO_FETCH_TIMEOUT


 def _load_image_from_bytes(b: bytes):

+ 3 - 5
aphrodite/plugins/__init__.py

@@ -1,9 +1,7 @@
-import os
-
 from loguru import logger

-APHRODITE_PLUGINS = None if "APHRODITE_PLUGINS" not in os.environ else \
-    os.environ["APHRODITE_PLUGINS"].split(",")
+from aphrodite import envs
+

 def load_general_plugins():
     """WARNING: plugins can be loaded for multiple times in different
@@ -16,7 +14,7 @@ def load_general_plugins():
     else:
         from importlib.metadata import entry_points

-    allowed_plugins = APHRODITE_PLUGINS
+    allowed_plugins = envs.APHRODITE_PLUGINS

     discovered_plugins = entry_points(group='aphrodite.general_plugins')
     for plugin in discovered_plugins:
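
For context, a third-party package would hook into the `aphrodite.general_plugins` entry-point group roughly like this. The package and module names are hypothetical, and exactly how the loaded object is invoked depends on the unexpanded body of the loop above:

from setuptools import setup

setup(
    name="my-aphrodite-plugin",  # hypothetical package
    version="0.1",
    py_modules=["my_plugin"],
    entry_points={
        "aphrodite.general_plugins": [
            # Discovered by entry_points(group='aphrodite.general_plugins').
            # Setting APHRODITE_PLUGINS=my_plugin restricts loading to the
            # names in that comma-separated list.
            "my_plugin = my_plugin:register",
        ],
    },
)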

+ 2 - 3
aphrodite/quantization/fp8.py

@@ -1,4 +1,3 @@
-import os
 from typing import Any, Dict, List, Optional

 import torch
@@ -7,6 +6,7 @@ from torch.nn import Module
 from torch.nn.parameter import Parameter

 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.utils import is_hip, print_warning_once
 from aphrodite.modeling.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from aphrodite.modeling.layers.linear import (LinearBase, LinearMethodBase,
@@ -26,8 +26,7 @@ from aphrodite.quantization.utils.w8a8_utils import (
     requantize_with_max_scale)

 ACTIVATION_SCHEMES = ["static", "dynamic"]
-APHRODITE_TEST_FORCE_FP8_MARLIN = os.environ.get(
-    "APHRODITE_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in ("1", "true")
+APHRODITE_TEST_FORCE_FP8_MARLIN = envs.APHRODITE_TEST_FORCE_FP8_MARLIN


 class Fp8Config(QuantizationConfig):

+ 3 - 3
aphrodite/server/launch.py

@@ -1,5 +1,4 @@
 import asyncio
-import os
 import signal
 from http import HTTPStatus
 from typing import Any
@@ -8,12 +7,13 @@ import uvicorn
 from fastapi import FastAPI, Response
 from loguru import logger

+from aphrodite import envs
 from aphrodite.common.utils import find_process_using_port, in_windows
 from aphrodite.engine.async_aphrodite import AsyncEngineDeadError
 from aphrodite.engine.protocol import AsyncEngineClient

-APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH = bool(os.getenv(
-    "APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH", 0))
+APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH = (
+    envs.APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH)


 async def serve_http(app: FastAPI, engine: AsyncEngineClient,

+ 2 - 3
aphrodite/task_handler/cpu_worker.py

@@ -1,10 +1,10 @@
 """A CPU worker class."""
 """A CPU worker class."""
-import os
 from typing import Dict, List, Optional, Tuple
 from typing import Dict, List, Optional, Tuple
 
 
 import torch
 import torch
 import torch.distributed
 import torch.distributed
 
 
+from aphrodite import envs
 from aphrodite.attention import get_attn_backend
 from aphrodite.attention import get_attn_backend
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
@@ -19,8 +19,7 @@ from aphrodite.task_handler.worker_base import (LocalOrDistributedWorkerBase,
                                                 LoraNotSupportedWorkerBase,
                                                 LoraNotSupportedWorkerBase,
                                                 WorkerInput)
                                                 WorkerInput)
 
 
-APHRODITE_CPU_OMP_THREADS_BIND = os.getenv("APHRODITE_CPU_OMP_THREADS_BIND",
-                                           "all")
+APHRODITE_CPU_OMP_THREADS_BIND = envs.APHRODITE_CPU_OMP_THREADS_BIND
 
 
 
 
 class CPUCacheEngine:
 class CPUCacheEngine:

+ 2 - 2
aphrodite/task_handler/tpu_worker.py

@@ -5,6 +5,7 @@ import torch
 import torch_xla.core.xla_model as xm
 import torch_xla.runtime as xr

+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)
@@ -99,8 +100,7 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         # Use persistent cache to avoid XLA recompilation.
         # NOTE: Set per-rank cache path since different ranks
         # can have slightly different XLA graphs.
-        APHRODITE_XLA_CACHE_PATH = os.getenv("APHRODITE_XLA_CACHE_PATH",
-                                             "~/.aphrodite/xla_cache/")
+        APHRODITE_XLA_CACHE_PATH = envs.APHRODITE_XLA_CACHE_PATH
         world_size = self.parallel_config.world_size
         per_rank_path = os.path.join(APHRODITE_XLA_CACHE_PATH,
                                      f"tp{world_size}_rank{self.rank}")

+ 2 - 2
aphrodite/transformers_utils/config.py

@@ -1,7 +1,6 @@
 import contextlib
 import enum
 import json
-import os
 from pathlib import Path
 from typing import Any, Dict, Optional, Type, Union

@@ -14,6 +13,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME

+from aphrodite import envs
 from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   InternVLChatConfig,
                                                   JAISConfig, MedusaConfig,
@@ -21,7 +21,7 @@ from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   MPTConfig, RWConfig)
 from aphrodite.transformers_utils.utils import check_gguf_file

-APHRODITE_USE_MODELSCOPE = os.getenv("APHRODITE_USE_MODELSCOPE", "0") == "1"
+APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE

 if APHRODITE_USE_MODELSCOPE:
     from modelscope import AutoConfig

+ 4 - 5
examples/tensorize_aphrodite_model.py

@@ -1,10 +1,9 @@
 import argparse
 import dataclasses
 import json
-import os
 import uuid

-from aphrodite import LLM
+from aphrodite import LLM, envs
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.modeling.model_loader.tensorizer import (
     TensorizerArgs, TensorizerConfig, tensorize_aphrodite_model)
@@ -177,11 +176,11 @@ if __name__ == '__main__':
     args = parse_args()

     s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-                        or os.environ.get("S3_ACCESS_KEY_ID", None))
+                        or envs.S3_ACCESS_KEY_ID)
     s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
+                            or envs.S3_SECRET_ACCESS_KEY)
     s3_endpoint = (getattr(args, 's3_endpoint', None)
-                or os.environ.get("S3_ENDPOINT_URL", None))
+                or envs.S3_ENDPOINT_URL)

     credentials = {
         "s3_access_key_id": s3_access_key_id,

+ 22 - 7
setup.py

@@ -1,3 +1,4 @@
+import importlib.util
 import io
 import logging
 import os
@@ -14,10 +15,16 @@ from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 from torch.utils.cpp_extension import CUDA_HOME

+
+def load_module_from_path(module_name, path):
+    spec = importlib.util.spec_from_file_location(module_name, path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
-# Target device of Aphrodite, supporting [cuda (by default), rocm, neuron, cpu]
-APHRODITE_TARGET_DEVICE = os.getenv("APHRODITE_TARGET_DEVICE", "cuda")


 def embed_commit_hash():
@@ -47,6 +54,14 @@ def embed_commit_hash():

 embed_commit_hash()

+
+# cannot import envs directly because it depends on aphrodite,
+#  which is not installed yet
+envs = load_module_from_path('envs', os.path.join(
+    ROOT_DIR, 'aphrodite', 'envs.py'))
+
+APHRODITE_TARGET_DEVICE = envs.APHRODITE_TARGET_DEVICE
+
 if not sys.platform.startswith("linux"):
     logger.warning(
         "Aphrodite only supports Linux platform (including WSL). "
@@ -97,7 +112,7 @@ class cmake_build_ext(build_ext):
     def compute_num_jobs(self):
         # `num_jobs` is either the value of the MAX_JOBS environment variable
         # (if defined) or the number of CPUs available.
-        num_jobs = os.environ.get("MAX_JOBS", None)
+        num_jobs = envs.MAX_JOBS
         if num_jobs is not None:
             num_jobs = int(num_jobs)
             logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")
@@ -118,7 +133,7 @@ class cmake_build_ext(build_ext):
             # environment variable (if defined) or 1.
             # when it is set, we reduce `num_jobs` to avoid
             # overloading the system.
-            nvcc_threads = os.getenv("NVCC_THREADS", None)
+            nvcc_threads = envs.NVCC_THREADS
            if nvcc_threads is not None:
                 nvcc_threads = int(nvcc_threads)
                 logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number"
@@ -143,7 +158,7 @@ class cmake_build_ext(build_ext):
         # Select the build type.
         # Note: optimization level + debug info are set by the build type
         default_cfg = "Debug" if self.debug else "RelWithDebInfo"
-        cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
+        cfg = envs.CMAKE_BUILD_TYPE or default_cfg

         # where .so files will be written, should be the same for all extensions
         # that use the same CMakeLists.txt.
@@ -161,7 +176,7 @@ class cmake_build_ext(build_ext):
             '-DAPHRODITE_TARGET_DEVICE={}'.format(APHRODITE_TARGET_DEVICE),
         ]

-        verbose = bool(int(os.getenv('VERBOSE', '0')))
+        verbose = envs.VERBOSE
         if verbose:
             cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']

@@ -469,7 +484,7 @@ package_data = {
         "py.typed", "modeling/layers/fused_moe/configs/*.json"
     ]
 }
-if os.environ.get("APHRODITE_USE_PRECOMPILED"):
+if envs.APHRODITE_USE_PRECOMPILED:
     ext_modules = []
     package_data["aphrodite"].append("*.so")