
chore: consolidate environment variables within one file (#882)

AlpinDale, 3 months ago
Commit 901900854e
36 changed files with 543 additions and 132 deletions
  1. + 3 - 7      aphrodite/assets/base.py
  2. + 2 - 4      aphrodite/attention/backends/rocm_flash_attn.py
  3. + 2 - 1      aphrodite/attention/selector.py
  4. + 34 - 16    aphrodite/common/config.py
  5. + 3 - 2      aphrodite/common/logger.py
  6. + 3 - 3      aphrodite/common/sampling_params.py
  7. + 8 - 11     aphrodite/common/utils.py
  8. + 2 - 2      aphrodite/distributed/device_communicators/custom_all_reduce.py
  9. + 7 - 6      aphrodite/distributed/device_communicators/custom_all_reduce_utils.py
  10. + 3 - 3     aphrodite/distributed/device_communicators/shm_broadcast.py
  11. + 3 - 2     aphrodite/distributed/parallel_state.py
  12. + 3 - 2     aphrodite/distributed/utils.py
  13. + 2 - 1     aphrodite/endpoints/openai/api_server.py
  14. + 2 - 3     aphrodite/engine/aphrodite_engine.py
  15. + 2 - 3     aphrodite/engine/args_tools.py
  16. + 2 - 3     aphrodite/engine/async_aphrodite.py
  17. + 388 - 0   aphrodite/envs.py
  18. + 2 - 1     aphrodite/executor/cpu_executor.py
  19. + 3 - 1     aphrodite/executor/multiproc_worker_utils.py
  20. + 4 - 5     aphrodite/executor/openvino_executor.py
  21. + 7 - 8     aphrodite/executor/ray_gpu_executor.py
  22. + 2 - 1     aphrodite/executor/ray_tpu_executor.py
  23. + 2 - 1     aphrodite/executor/ray_xpu_executor.py
  24. + 2 - 2     aphrodite/modeling/layers/fused_moe/fused_moe.py
  25. + 2 - 3     aphrodite/modeling/layers/sampler.py
  26. + 3 - 3     aphrodite/modeling/model_loader/openvino.py
  27. + 4 - 3     aphrodite/modeling/model_loader/tensorizer.py
  28. + 3 - 5     aphrodite/multimodal/utils.py
  29. + 3 - 5     aphrodite/plugins/__init__.py
  30. + 2 - 3     aphrodite/quantization/fp8.py
  31. + 3 - 3     aphrodite/server/launch.py
  32. + 2 - 3     aphrodite/task_handler/cpu_worker.py
  33. + 2 - 2     aphrodite/task_handler/tpu_worker.py
  34. + 2 - 2     aphrodite/transformers_utils/config.py
  35. + 4 - 5     examples/tensorize_aphrodite_model.py
  36. + 22 - 7    setup.py

+ 3 - 7
aphrodite/assets/base.py

@@ -5,6 +5,7 @@ from functools import lru_cache
 from pathlib import Path
 from typing import Optional
 
+from aphrodite import envs
 from aphrodite.connections import global_http_connection
 
 
@@ -15,13 +16,8 @@ def get_default_cache_root():
     )
 
 vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
-APHRODITE_ASSETS_CACHE = os.path.expanduser(
-    os.getenv(
-        "APHRODITE_ASSETS_CACHE",
-        os.path.join(get_default_cache_root(), "aphrodite", "assets"),
-    ))
-APHRODITE_IMAGE_FETCH_TIMEOUT = int(os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT",
-                                              5))
+APHRODITE_ASSETS_CACHE = envs.APHRODITE_ASSETS_CACHE
+APHRODITE_IMAGE_FETCH_TIMEOUT = envs.APHRODITE_IMAGE_FETCH_TIMEOUT
 
 def get_cache_dir() -> Path:
     """Get the path to the cache for storing downloaded assets."""

+ 2 - 4
aphrodite/attention/backends/rocm_flash_attn.py

@@ -1,11 +1,11 @@
 """Attention layer ROCm GPUs."""
-import os
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.attention.backends.abstract import (AttentionBackend,
                                                    AttentionImpl,
                                                    AttentionMetadata,
@@ -280,9 +280,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
 
         self.use_naive_attn = False
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
-        self.use_triton_flash_attn = (os.environ.get(
-            "APHRODITE_USE_TRITON_FLASH_ATTN", "True").lower()
-                                      in ("true", "1"))
+        self.use_triton_flash_attn = envs.APHRODITE_USE_TRITON_FLASH_ATTN
         if self.use_triton_flash_attn:
             from aphrodite.attention.ops.triton_flash_attn import (  # noqa: F401
                 triton_attention)

+ 2 - 1
aphrodite/attention/selector.py

@@ -7,12 +7,13 @@ from typing import Generator, Optional, Type
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.attention.backends.abstract import AttentionBackend
 from aphrodite.common.utils import (STR_BACKEND_ENV_VAR, is_cpu, is_hip,
                                     is_openvino, is_xpu)
 from aphrodite.platforms import current_platform
 
-APHRODITE_ATTENTION_BACKEND = os.getenv("APHRODITE_ATTENTION_BACKEND", None)
+APHRODITE_ATTENTION_BACKEND = envs.APHRODITE_ATTENTION_BACKEND
 
 
 class _Backend(enum.Enum):

+ 34 - 16
aphrodite/common/config.py

@@ -9,6 +9,7 @@ import torch
 from loguru import logger
 from transformers import PretrainedConfig
 
+from aphrodite import envs
 from aphrodite.common.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
                                     cuda_device_count_stateless,
                                     get_cpu_memory, is_cpu, is_hip, is_neuron,
@@ -30,8 +31,7 @@ if TYPE_CHECKING:
         BaseTokenizerGroup)
 
 # If true, will load models from ModelScope instead of Hugging Face Hub.
-APHRODITE_USE_MODELSCOPE = os.environ.get("APHRODITE_USE_MODELSCOPE",
-                                          "False").lower() == "true"
+APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE
 
 _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 
@@ -1820,21 +1820,39 @@ def _get_and_verify_max_len(
                     "original_max_position_embeddings"]
             derived_max_model_len *= scaling_factor
 
+    # If the user specified a max length, make sure it is smaller than the
+    # derived length from the HF model config.
     if max_model_len is None:
-        max_model_len = derived_max_model_len
-    elif max_model_len > derived_max_model_len and rope_scaling_arg is None:
-        raise ValueError(
-            f"User-specified max_model_len {max_model_len} is higher than "
-            f"the original {derived_max_model_len}. "
-            "Please provide a rope_scaling dict to scale the model.")
-    elif max_model_len > derived_max_model_len and rope_scaling_arg is not None:
-        # hope this works
-        logger.warning(
-            f"User-specified max_model_len {max_model_len} is higher than "
-            f"the original {derived_max_model_len}. "
-            "Attempting to use RoPE scaling with the provided rope_scaling "
-            "dict.")
-        derived_max_model_len = max_model_len
+        max_model_len = int(derived_max_model_len)
+    elif max_model_len > derived_max_model_len:
+        # Some models might have a separate key for specifying model_max_length
+        # that will be bigger than derived_max_model_len. We compare user input
+        # with model_max_length and allow this override when it's smaller.
+        model_max_length = getattr(hf_config, "model_max_length", None)
+        if envs.APHRODITE_DYNAMIC_ROPE_SCALING:
+            scaling_factor = max_model_len / derived_max_model_len
+            hf_config.rope_scaling = {"factor": scaling_factor,
+                                      "type": "dynamic"}
+            logger.info(
+                "Using dynamic RoPE scaling to extend the model's max context "
+                f"length from {derived_max_model_len} to {max_model_len}.")
+            derived_max_model_len = max_model_len
+        elif model_max_length is not None and max_model_len <= model_max_length:
+            if disable_sliding_window:
+                # TODO: Find a model that has model_max_length
+                # with sliding window to see if this case should be allowed.
+                raise NotImplementedError(
+                    "Disabling sliding window is not supported for models "
+                    "with model_max_length in the config. Please raise an "
+                    "issue so we can investigate.")
+        else:
+            raise ValueError(
+                f"User-specified max_model_len ({max_model_len}) is greater "
+                f"than the derived max_model_len ({max_len_key}="
+                f"{derived_max_model_len} or model_max_length="
+                f"{model_max_length} in model's config.json). To allow "
+                "greater lengths, please set the env var "
+                "APHRODITE_DYNAMIC_ROPE_SCALING=1")
 
     return int(max_model_len)
 

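For context (not part of the diff): a quick sketch of how the new APHRODITE_DYNAMIC_ROPE_SCALING override is meant to be exercised, assuming a model whose config.json yields a derived limit of 4096 tokens. The model name and lengths below are placeholders, not values from this commit.

import os

# Must be set before the engine builds its ModelConfig; with the lazy envs
# module it is read at that point, not at import time.
os.environ["APHRODITE_DYNAMIC_ROPE_SCALING"] = "1"

from aphrodite import LLM

# max_model_len exceeds the derived 4096-token limit, so per the branch above
# the engine sets hf_config.rope_scaling = {"factor": 2.0, "type": "dynamic"}
# instead of raising ValueError.
llm = LLM(model="some/model", max_model_len=8192)
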
+ 3 - 2
aphrodite/common/logger.py

@@ -15,11 +15,12 @@ from rich.markup import escape
 from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            TaskProgressColumn, TextColumn, TimeRemainingColumn)
 
+from aphrodite import envs
+
 RICH_CONSOLE = Console()
 LOG_LEVEL = os.getenv("APHRODITE_LOG_LEVEL", "INFO").upper()
 
-APHRODITE_CONFIGURE_LOGGING = int(os.getenv("APHRODITE_CONFIGURE_LOGGING",
-                                            "1"))
+APHRODITE_CONFIGURE_LOGGING = envs.APHRODITE_CONFIGURE_LOGGING
 
 
 def unwrap(wrapped, default=None):

+ 3 - 3
aphrodite/common/sampling_params.py

@@ -1,6 +1,5 @@
 """Sampling parameters for text generation."""
 import copy
-import os
 from enum import IntEnum
 from functools import cached_property
 from typing import Any, Callable, Dict, List, Optional, Set, Union
@@ -10,11 +9,12 @@ import torch
 from loguru import logger
 from typing_extensions import Annotated
 
+from aphrodite import envs
+
 _SAMPLING_EPS = 1e-5
 _MAX_TEMP = 1e-2
 
-APHRODITE_NO_DEPRECATION_WARNING = bool(
-    int(os.environ.get("APHRODITE_NO_DEPRECATION_WARNING", "0")))
+APHRODITE_NO_DEPRECATION_WARNING = envs.APHRODITE_NO_DEPRECATION_WARNING
 
 
 class SamplingType(IntEnum):

+ 8 - 11
aphrodite/common/utils.py

@@ -31,6 +31,7 @@ from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            SpinnerColumn, TextColumn, TimeElapsedColumn)
 from typing_extensions import ParamSpec, TypeIs, assert_never
 
+from aphrodite import envs
 from aphrodite.common.logger import enable_trace_function_call
 from aphrodite.distributed import get_tensor_model_parallel_rank
 
@@ -382,8 +383,7 @@ def get_aphrodite_instance_id():
     Instance id represents an instance of the Aphrodite. All processes in the
     same instance should have the same instance id.
     """
-    return os.environ.get("APHRODITE_INSTANCE_ID",
-                          f"aphrodite-instance-{random_uuid()}")
+    return envs.APHRODITE_INSTANCE_ID or f"aphrodite-instance-{random_uuid()}"
 
 
 @lru_cache(maxsize=None)
@@ -520,9 +520,7 @@ def get_distributed_init_method(ip: str, port: int) -> str:
 
 def get_open_zmq_ipc_path() -> str:
     if not in_windows():
-        APHRODITE_RPC_BASE_PATH = os.getenv("APHRODITE_RPC_BASE_PATH",
-                                        tempfile.gettempdir())
-        base_rpc_path = APHRODITE_RPC_BASE_PATH
+        base_rpc_path = envs.APHRODITE_RPC_BASE_PATH
         return f"ipc://{base_rpc_path}/{uuid4()}"
     else:
         # windows doesn't support ipc://
@@ -530,8 +528,7 @@ def get_open_zmq_ipc_path() -> str:
         return f"tcp://127.0.0.1:{get_open_port()}"
      
 def get_open_port(port: Optional[int] = None) -> int:
-    port = int(os.getenv("APHRODITE_PORT", 0)
-                ) if "APHRODITE_PORT" in os.environ else None
+    port = envs.APHRODITE_PORT
     if port is not None:
         while True:
             try:
@@ -948,7 +945,7 @@ def find_library(lib_name: str) -> str:
     # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
     locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
     # `LD_LIBRARY_PATH` searches the library in the user-defined paths
-    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    env_ld_library_path = envs.LD_LIBRARY_PATH
     if not locs and env_ld_library_path:
         locs = [
             os.path.join(dir, lib_name)
@@ -967,7 +964,7 @@ def find_nccl_library() -> str:
     After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
     found by `ctypes` automatically.
     """
-    so_file = os.environ.get("APHRODITE_NCCL_SO_PATH", "")
+    so_file = envs.APHRODITE_NCCL_SO_PATH
 
     # manually load the nccl library
     if so_file:
@@ -985,7 +982,7 @@ def find_nccl_library() -> str:
 
 
 def enable_trace_function_call_for_thread() -> None:
-    if int(os.getenv("APHRODITE_TRACE_FUNCTION", "0")):
+    if envs.APHRODITE_TRACE_FUNCTION:
         tmp_dir = tempfile.gettempdir()
         filename = (f"APHRODITE_TRACE_FUNCTION_for_process_{os.getpid()}"
                     f"_thread_{threading.get_ident()}_"
@@ -1074,7 +1071,7 @@ def cuda_device_count_stateless() -> int:
     # This can be removed and simply replaced with torch.cuda.get_device_count
     # after https://github.com/pytorch/pytorch/pull/122815 is released.
 
-    return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES"))
+    return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
 #From: https://stackoverflow.com/a/4104188/2749989

+ 2 - 2
aphrodite/distributed/device_communicators/custom_all_reduce.py

@@ -1,4 +1,3 @@
-import os
 from contextlib import contextmanager
 from typing import Any, List, Optional, Union
 
@@ -8,6 +7,7 @@ from loguru import logger
 from torch.distributed import ProcessGroup
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.utils import cuda_device_count_stateless
 from aphrodite.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
@@ -95,7 +95,7 @@ class CustomAllreduce:
         assert isinstance(device, torch.device)
         self.device = device
 
-        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:

+ 7 - 6
aphrodite/distributed/device_communicators/custom_all_reduce_utils.py

@@ -11,6 +11,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.utils import (cuda_device_count_stateless,
                                     update_environment_variables)
 from aphrodite.distributed.device_communicators.cuda_wrapper import (
@@ -124,7 +125,7 @@ def can_actually_p2p(
     processes for testing all pairs of GPUs in batch. The trick is to reset
     the device after each test (which is not available in PyTorch).
     """  # noqa
-    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     # pass the CUDA_VISIBLE_DEVICES to the child process
     # to make sure they see the same set of GPUs
 
@@ -183,13 +184,13 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     is_distributed = dist.is_initialized()
 
     num_dev = cuda_device_count_stateless()
-    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
-    APHRODITE_CONFIG_ROOT = os.getenv("APHRODITE_CONFIG_ROOT", "~/.config")
-    path = os.path.expanduser(
-        f"{APHRODITE_CONFIG_ROOT}/aphrodite/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
-    )
+
+    path = os.path.join(
+        envs.APHRODITE_CACHE_ROOT,
+        f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
     os.makedirs(os.path.dirname(path), exist_ok=True)
     from aphrodite.distributed.parallel_state import get_world_group
     if ((not is_distributed or get_world_group().local_rank == 0)

+ 3 - 3
aphrodite/distributed/device_communicators/shm_broadcast.py

@@ -1,4 +1,3 @@
-import os
 import pickle
 import time
 from contextlib import contextmanager
@@ -13,10 +12,11 @@ from loguru import logger
 from torch.distributed import ProcessGroup
 from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
 
+from aphrodite import envs
 from aphrodite.common.utils import get_ip, get_open_port
 
-APHRODITE_RINGBUFFER_WARNING_INTERVAL = os.getenv(
-    "APHRODITE_RINGBUFFER_WARNING_INTERVAL", 60)
+APHRODITE_RINGBUFFER_WARNING_INTERVAL = (
+    envs.APHRODITE_RINGBUFFER_WARNING_INTERVAL)
 
 # time to wait if the queue is full or empty
 # if we sleep for too short, it will consume too much CPU

+ 3 - 2
aphrodite/distributed/parallel_state.py

@@ -21,7 +21,6 @@ If you only need to use the distributed environment without model/pipeline
  steps.
 """
 import contextlib
-import os
 import pickle
 import sys
 from collections import namedtuple
@@ -36,6 +35,8 @@ import torch.distributed
 from loguru import logger
 from torch.distributed import Backend, ProcessGroup
 
+from aphrodite import envs
+
 
 @dataclass
 class GraphCaptureContext:
@@ -866,7 +867,7 @@ def init_distributed_environment(
         # local rank not set, this usually happens in single-node
         # setting, where we can use rank as local rank
         if distributed_init_method == "env://":
-            local_rank = os.getenv("LOCAL_RANK", rank)
+            local_rank = envs.LOCAL_RANK
         else:
             local_rank = rank
     global _WORLD

+ 3 - 2
aphrodite/distributed/utils.py

@@ -3,12 +3,13 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-import os
 from typing import Sequence, Tuple
 
 import torch
 
-APHRODITE_PP_LAYER_PARTITION = os.getenv("APHRODITE_PP_LAYER_PARTITION", None)
+from aphrodite import envs
+
+APHRODITE_PP_LAYER_PARTITION = envs.APHRODITE_PP_LAYER_PARTITION
 
 
 def ensure_divisibility(numerator, denominator):

+ 2 - 1
aphrodite/endpoints/openai/api_server.py

@@ -20,6 +20,7 @@ from fastapi.responses import (HTMLResponse, JSONResponse, Response,
 from loguru import logger
 from starlette.routing import Mount
 
+from aphrodite import envs
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sampling_params import _SAMPLING_EPS, SamplingParams
@@ -635,7 +636,7 @@ def build_app(args: Namespace) -> FastAPI:
         return JSONResponse(err.model_dump(),
                             status_code=HTTPStatus.BAD_REQUEST)
 
-    if token := os.environ.get("APHRODITE_API_KEY") or args.api_keys:
+    if token := envs.APHRODITE_API_KEY or args.api_keys:
         admin_key = os.environ.get("APHRODITE_ADMIN_KEY") or args.admin_key
 
         if admin_key is None:

+ 2 - 3
aphrodite/engine/aphrodite_engine.py

@@ -1,4 +1,3 @@
-import os
 import time
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, ClassVar, Dict, Iterable, List, Optional
@@ -9,6 +8,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DecodingConfig, DeviceConfig,
                                      EngineConfig, LoadConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,
@@ -50,8 +50,7 @@ from aphrodite.version import __version__ as APHRODITE_VERSION
 
 _LOCAL_LOGGING_INTERVAL_SEC = 5
 
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
 
 
 def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:

+ 2 - 3
aphrodite/engine/args_tools.py

@@ -1,13 +1,13 @@
 import argparse
 import dataclasses
 import json
-import os
 from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,
                     Union)
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, ConfigFormat, DecodingConfig,
                                      DeviceConfig, EngineConfig, LoadConfig,
                                      LoadFormat, LoRAConfig, ModelConfig,
@@ -24,8 +24,7 @@ if TYPE_CHECKING:
     from aphrodite.transformers_utils.tokenizer_group import BaseTokenizerGroup
 
 
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
 
 def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
     if len(val) == 0:

+ 2 - 3
aphrodite/engine/async_aphrodite.py

@@ -1,5 +1,4 @@
 import asyncio
-import os
 import time
 from dataclasses import dataclass
 from functools import partial
@@ -11,6 +10,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
+from aphrodite import envs
 from aphrodite.common.config import (DecodingConfig, EngineConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)
@@ -34,8 +34,7 @@ from aphrodite.lora.request import LoRARequest
 from aphrodite.processing.scheduler import SchedulerOutputs
 from aphrodite.prompt_adapter.request import PromptAdapterRequest
 
-ENGINE_ITERATION_TIMEOUT_S = int(
-    os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", "60"))
+ENGINE_ITERATION_TIMEOUT_S = envs.APHRODITE_ENGINE_ITERATION_TIMEOUT_S
 
 
 class AsyncEngineDeadError(RuntimeError):

+ 388 - 0
aphrodite/envs.py

@@ -0,0 +1,388 @@
+import os
+import tempfile
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+
+if TYPE_CHECKING:
+    APHRODITE_HOST_IP: str = ""
+    APHRODITE_PORT: Optional[int] = None
+    APHRODITE_RPC_BASE_PATH: str = tempfile.gettempdir()
+    APHRODITE_USE_MODELSCOPE: bool = False
+    APHRODITE_RINGBUFFER_WARNING_INTERVAL: int = 60
+    APHRODITE_INSTANCE_ID: Optional[str] = None
+    APHRODITE_NCCL_SO_PATH: Optional[str] = None
+    LD_LIBRARY_PATH: Optional[str] = None
+    APHRODITE_USE_TRITON_FLASH_ATTN: bool = False
+    LOCAL_RANK: int = 0
+    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    APHRODITE_ENGINE_ITERATION_TIMEOUT_S: int = 60
+    APHRODITE_API_KEY: Optional[str] = None
+    S3_ACCESS_KEY_ID: Optional[str] = None
+    S3_SECRET_ACCESS_KEY: Optional[str] = None
+    S3_ENDPOINT_URL: Optional[str] = None
+    APHRODITE_CACHE_ROOT: str = os.path.expanduser("~/.cache/aphrodite")
+    APHRODITE_CONFIG_ROOT: str = os.path.expanduser("~/.config/aphrodite")
+    APHRODITE_CONFIGURE_LOGGING: int = 1
+    APHRODITE_LOGGING_LEVEL: str = "INFO"
+    APHRODITE_LOGGING_CONFIG_PATH: Optional[str] = None
+    APHRODITE_TRACE_FUNCTION: int = 0
+    APHRODITE_ATTENTION_BACKEND: Optional[str] = None
+    APHRODITE_USE_SAMPLING_KERNELS: bool = False
+    APHRODITE_PP_LAYER_PARTITION: Optional[str] = None
+    APHRODITE_CPU_KVCACHE_SPACE: int = 0
+    APHRODITE_CPU_OMP_THREADS_BIND: str = ""
+    APHRODITE_OPENVINO_KVCACHE_SPACE: int = 0
+    APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
+    APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
+    APHRODITE_XLA_CACHE_PATH: str = os.path.join(APHRODITE_CACHE_ROOT, "xla_cache")  # noqa: E501
+    APHRODITE_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
+    APHRODITE_USE_RAY_SPMD_WORKER: bool = False
+    APHRODITE_USE_RAY_COMPILED_DAG: bool = False
+    APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
+    APHRODITE_WORKER_MULTIPROC_METHOD: str = "fork"
+    APHRODITE_ASSETS_CACHE: str = os.path.join(APHRODITE_CACHE_ROOT, "assets")
+    APHRODITE_IMAGE_FETCH_TIMEOUT: int = 5
+    APHRODITE_AUDIO_FETCH_TIMEOUT: int = 5
+    APHRODITE_TARGET_DEVICE: str = "cuda"
+    MAX_JOBS: Optional[str] = None
+    NVCC_THREADS: Optional[str] = None
+    APHRODITE_USE_PRECOMPILED: bool = False
+    APHRODITE_NO_DEPRECATION_WARNING: bool = False
+    APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
+    CMAKE_BUILD_TYPE: Optional[str] = None
+    VERBOSE: bool = False
+    APHRODITE_DYNAMIC_ROPE_SCALING: bool = False
+    APHRODITE_TEST_FORCE_FP8_MARLIN: bool = False
+    APHRODITE_ALLOW_ENGINE_USE_RAY: bool = False
+    APHRODITE_PLUGINS: Optional[List[str]] = None
+
+
+def get_default_cache_root():
+    return os.getenv(
+        "XDG_CACHE_HOME",
+        os.path.join(os.path.expanduser("~"), ".cache"),
+    )
+
+
+def get_default_config_root():
+    return os.getenv(
+        "XDG_CONFIG_HOME",
+        os.path.join(os.path.expanduser("~"), ".config"),
+    )
+
+
+# The begin-* and end* here are used by the documentation generator
+# to extract the used env vars.
+
+# begin-env-vars-definition
+
+environment_variables: Dict[str, Callable[[], Any]] = {
+
+    # ================== Installation Time Env Vars ==================
+
+    # Target device of Aphrodite, supporting [cuda (by default),
+    # rocm, neuron, cpu, openvino]
+    "APHRODITE_TARGET_DEVICE":
+    lambda: os.getenv("APHRODITE_TARGET_DEVICE", "cuda"),
+
+    # Maximum number of compilation jobs to run in parallel.
+    # By default this is the number of CPUs
+    "MAX_JOBS":
+    lambda: os.getenv("MAX_JOBS", None),
+
+    # Number of threads to use for nvcc
+    # By default this is 1.
+    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
+    "NVCC_THREADS":
+    lambda: os.getenv("NVCC_THREADS", None),
+
+    # If set, Aphrodite will use precompiled binaries (*.so)
+    "APHRODITE_USE_PRECOMPILED":
+    lambda: bool(os.environ.get("APHRODITE_USE_PRECOMPILED")),
+
+    # CMake build type
+    # If not set, defaults to "Debug" or "RelWithDebInfo"
+    # Available options: "Debug", "Release", "RelWithDebInfo"
+    "CMAKE_BUILD_TYPE":
+    lambda: os.getenv("CMAKE_BUILD_TYPE"),
+
+    # If set, Aphrodite will print verbose logs during installation
+    "VERBOSE":
+    lambda: bool(int(os.getenv('VERBOSE', '0'))),
+
+    # Root directory for APHRODITE configuration files
+    # Defaults to `~/.config/aphrodite` unless `XDG_CONFIG_HOME` is set
+    # Note that this not only affects how aphrodite finds its configuration
+    # files during runtime, but also affects how aphrodite installs its
+    # configuration files during **installation**.
+    "APHRODITE_CONFIG_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_CONFIG_ROOT",
+            os.path.join(get_default_config_root(), "aphrodite"),
+        )),
+
+    # ================== Runtime Env Vars ==================
+
+    # Root directory for APHRODITE cache files
+    # Defaults to `~/.cache/aphrodite` unless `XDG_CACHE_HOME` is set
+    "APHRODITE_CACHE_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_CACHE_ROOT",
+            os.path.join(get_default_cache_root(), "aphrodite"),
+        )),
+
+    # used in distributed environment to determine the master address
+    'APHRODITE_HOST_IP':
+    lambda: os.getenv('APHRODITE_HOST_IP', "") or os.getenv("HOST_IP", ""),
+
+    # used in distributed environment to manually set the communication port
+    # Note: if APHRODITE_PORT is set, and some code asks for multiple ports, the
+    # APHRODITE_PORT will be used as the first port, and the rest will be
+    # generated by incrementing the APHRODITE_PORT value.
+    # '0' is used to make mypy happy
+    'APHRODITE_PORT':
+    lambda: int(os.getenv('APHRODITE_PORT', '0'))
+    if 'APHRODITE_PORT' in os.environ else None,
+
+    # path used for ipc when the frontend api server is running in
+    # multi-processing mode to communicate with the backend engine process.
+    'APHRODITE_RPC_BASE_PATH':
+    lambda: os.getenv('APHRODITE_RPC_BASE_PATH', tempfile.gettempdir()),
+
+    # If true, will load models from ModelScope instead of Hugging Face Hub.
+    # note that the value is true or false, not numbers
+    "APHRODITE_USE_MODELSCOPE":
+    lambda: os.environ.get(
+        "APHRODITE_USE_MODELSCOPE", "False").lower() == "true",
+
+    # Instance id represents an instance of the APHRODITE. All processes in the
+    # same instance should have the same instance id.
+    "APHRODITE_INSTANCE_ID":
+    lambda: os.environ.get("APHRODITE_INSTANCE_ID", None),
+
+    # Interval in seconds to log a warning message when the ring buffer is full
+    "APHRODITE_RINGBUFFER_WARNING_INTERVAL":
+    lambda: int(os.environ.get("APHRODITE_RINGBUFFER_WARNING_INTERVAL", "60")),
+
+    # path to cudatoolkit home directory, under which should be bin, include,
+    # and lib directories.
+    "CUDA_HOME":
+    lambda: os.environ.get("CUDA_HOME", None),
+
+    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
+    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
+    "APHRODITE_NCCL_SO_PATH":
+    lambda: os.environ.get("APHRODITE_NCCL_SO_PATH", None),
+
+    # when `APHRODITE_NCCL_SO_PATH` is not set, aphrodite will try to find the
+    # nccl library file in the locations specified by `LD_LIBRARY_PATH`
+    "LD_LIBRARY_PATH":
+    lambda: os.environ.get("LD_LIBRARY_PATH", None),
+
+    # flag to control if aphrodite should use triton flash attention
+    "APHRODITE_USE_TRITON_FLASH_ATTN":
+    lambda: (os.environ.get(
+        "APHRODITE_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")),
+
+    # Internal flag to enable Dynamo graph capture
+    "APHRODITE_TEST_DYNAMO_GRAPH_CAPTURE":
+    lambda: int(os.environ.get("APHRODITE_TEST_DYNAMO_GRAPH_CAPTURE", "0")),
+
+    # local rank of the process in the distributed setting, used to determine
+    # the GPU device id
+    "LOCAL_RANK":
+    lambda: int(os.environ.get("LOCAL_RANK", "0")),
+
+    # used to control the visible devices in the distributed setting
+    "CUDA_VISIBLE_DEVICES":
+    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
+
+    # timeout for each iteration in the engine
+    "APHRODITE_ENGINE_ITERATION_TIMEOUT_S":
+    lambda: int(os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", "60")),
+
+    # API key for APHRODITE API server
+    "APHRODITE_API_KEY":
+    lambda: os.environ.get("APHRODITE_API_KEY", None),
+
+    # S3 access information, used for tensorizer to load model from S3
+    "S3_ACCESS_KEY_ID":
+    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
+    "S3_SECRET_ACCESS_KEY":
+    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+    "S3_ENDPOINT_URL":
+    lambda: os.environ.get("S3_ENDPOINT_URL", None),
+
+    # Logging configuration
+    # If set to 0, aphrodite will not configure logging
+    # If set to 1, aphrodite will configure logging using the default
+    # configuration or the configuration file specified by
+    # APHRODITE_LOGGING_CONFIG_PATH
+    "APHRODITE_CONFIGURE_LOGGING":
+    lambda: int(os.getenv("APHRODITE_CONFIGURE_LOGGING", "1")),
+    "APHRODITE_LOGGING_CONFIG_PATH":
+    lambda: os.getenv("APHRODITE_LOGGING_CONFIG_PATH"),
+
+    # this is used for configuring the default logging level
+    "APHRODITE_LOGGING_LEVEL":
+    lambda: os.getenv("APHRODITE_LOGGING_LEVEL", "INFO"),
+
+    # Trace function calls
+    # If set to 1, aphrodite will trace function calls
+    # Useful for debugging
+    "APHRODITE_TRACE_FUNCTION":
+    lambda: int(os.getenv("APHRODITE_TRACE_FUNCTION", "0")),
+
+    # Backend for attention computation
+    # Available options:
+    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+    # - "FLASH_ATTN": use FlashAttention
+    # - "XFORMERS": use XFormers
+    # - "ROCM_FLASH": use ROCmFlashAttention
+    # - "FLASHINFER": use flashinfer
+    "APHRODITE_ATTENTION_BACKEND":
+    lambda: os.getenv("APHRODITE_ATTENTION_BACKEND", None),
+
+    # If set, aphrodite will use flashinfer sampler
+    "APHRODITE_USE_SAMPLING_KERNELS":
+    lambda: bool(int(os.getenv("APHRODITE_USE_SAMPLING_KERNELS", "0"))),
+
+    # Pipeline stage partition strategy
+    "APHRODITE_PP_LAYER_PARTITION":
+    lambda: os.getenv("APHRODITE_PP_LAYER_PARTITION", None),
+
+    # (CPU backend only) CPU key-value cache space.
+    # default is 4GB
+    "APHRODITE_CPU_KVCACHE_SPACE":
+    lambda: int(os.getenv("APHRODITE_CPU_KVCACHE_SPACE", "0")),
+
+    # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
+    # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
+    "APHRODITE_CPU_OMP_THREADS_BIND":
+    lambda: os.getenv("APHRODITE_CPU_OMP_THREADS_BIND", "all"),
+
+    # OpenVINO key-value cache space
+    # default is 4GB
+    "APHRODITE_OPENVINO_KVCACHE_SPACE":
+    lambda: int(os.getenv("APHRODITE_OPENVINO_KVCACHE_SPACE", "0")),
+
+    # OpenVINO KV cache precision
+    # default is bf16 if natively supported by platform, otherwise f16
+    # To enable KV cache compression, please, explicitly specify u8
+    "APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION":
+    lambda: os.getenv("APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION", None),
+
+    # Enables weights compression during model export via HF Optimum
+    # default is False
+    "APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
+    lambda: bool(os.getenv(
+        "APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
+
+    # If the env var is set, then all workers will execute as separate
+    # processes from the engine, and we use the same mechanism to trigger
+    # execution on all workers.
+    # Run aphrodite with APHRODITE_USE_RAY_SPMD_WORKER=1 to enable it.
+    "APHRODITE_USE_RAY_SPMD_WORKER":
+    lambda: bool(int(os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", "0"))),
+
+    # If the env var is set, it uses the Ray's compiled DAG API
+    # which optimizes the control plane overhead.
+    # Run aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
+    "APHRODITE_USE_RAY_COMPILED_DAG":
+    lambda: bool(int(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", "0"))),
+
+    # If the env var is set, it uses NCCL for communication in
+    # Ray's compiled DAG. This flag is ignored if
+    # APHRODITE_USE_RAY_COMPILED_DAG is not set.
+    "APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
+    lambda: bool(int(
+        os.getenv("APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))),
+
+    # Use dedicated multiprocess context for workers.
+    # Both spawn and fork work
+    "APHRODITE_WORKER_MULTIPROC_METHOD":
+    lambda: os.getenv("APHRODITE_WORKER_MULTIPROC_METHOD", "fork"),
+
+    # Path to the cache for storing downloaded assets
+    "APHRODITE_ASSETS_CACHE":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_ASSETS_CACHE",
+            os.path.join(get_default_cache_root(), "aphrodite", "assets"),
+        )),
+
+    # Timeout for fetching images when serving multimodal models
+    # Default is 5 seconds
+    "APHRODITE_IMAGE_FETCH_TIMEOUT":
+    lambda: int(os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT", "5")),
+
+    # Timeout for fetching audio when serving multimodal models
+    # Default is 5 seconds
+    "APHRODITE_AUDIO_FETCH_TIMEOUT":
+    lambda: int(os.getenv("APHRODITE_AUDIO_FETCH_TIMEOUT", "5")),
+
+    # Path to the XLA persistent cache directory.
+    # Only used for XLA devices such as TPUs.
+    "APHRODITE_XLA_CACHE_PATH":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_XLA_CACHE_PATH",
+            os.path.join(get_default_cache_root(), "aphrodite", "xla_cache"),
+        )),
+    "APHRODITE_FUSED_MOE_CHUNK_SIZE":
+    lambda: int(os.getenv("APHRODITE_FUSED_MOE_CHUNK_SIZE", "65536")),
+
+    # If set, aphrodite will skip the deprecation warnings.
+    "APHRODITE_NO_DEPRECATION_WARNING":
+    lambda: bool(int(os.getenv("APHRODITE_NO_DEPRECATION_WARNING", "0"))),
+
+    # If set, the OpenAI API server will stay alive even after the underlying
+    # AsyncLLMEngine errors and stops serving requests
+    "APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH":
+    lambda: bool(os.getenv("APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
+
+    # If the env var APHRODITE_DYNAMIC_ROPE_SCALING is set, it allows
+    # the user to specify a max sequence length greater than
+    # the max length derived from the model's config.json.
+    # To enable this, set APHRODITE_DYNAMIC_ROPE_SCALING=1.
+    "APHRODITE_DYNAMIC_ROPE_SCALING":
+    lambda:
+    (os.environ.get(
+        "APHRODITE_DYNAMIC_ROPE_SCALING",
+        "0").strip().lower() in ("1", "true")),
+
+    # If set, forces FP8 Marlin to be used for FP8 quantization regardless
+    # of the hardware support for FP8 compute.
+    "APHRODITE_TEST_FORCE_FP8_MARLIN":
+    lambda:
+    (os.environ.get("APHRODITE_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
+     ("1", "true")),
+
+    # If set, allow running the engine as a separate ray actor,
+    # which is a deprecated feature soon to be removed.
+    "APHRODITE_ALLOW_ENGINE_USE_RAY":
+    lambda:
+    (os.environ.get("APHRODITE_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
+     ("1", "true")),
+
+    # a list of plugin names to load, separated by commas.
+    # if this is not set, it means all plugins will be loaded
+    # if this is set to an empty string, no plugins will be loaded
+    "APHRODITE_PLUGINS":
+    lambda: None if "APHRODITE_PLUGINS" not in os.environ else os.environ[
+        "APHRODITE_PLUGINS"].split(","),
+}
+
+# end-env-vars-definition
+
+
+def __getattr__(name: str):
+    # lazy evaluation of environment variables
+    if name in environment_variables:
+        return environment_variables[name]()
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    return list(environment_variables.keys())

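A short illustration (mine, not part of the commit) of the lazy lookup that the module-level __getattr__ above provides: every attribute access re-runs the corresponding lambda against the current process environment, so values exported after import are still honoured. The port value is only an example.

import os
from aphrodite import envs

print(envs.APHRODITE_PORT)        # None while APHRODITE_PORT is unset

os.environ["APHRODITE_PORT"] = "2242"   # example value, not a project default
print(envs.APHRODITE_PORT)        # 2242 -- re-evaluated on this access

print("APHRODITE_ATTENTION_BACKEND" in dir(envs))   # True; dir() lists the keys
# envs.NOT_A_REAL_VAR would raise AttributeError, per __getattr__ above.
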
+ 2 - 1
aphrodite/executor/cpu_executor.py

@@ -5,6 +5,7 @@ from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import CacheConfig, ModelConfig, SchedulerConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_aphrodite_instance_id,
@@ -333,7 +334,7 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
         logger.warning("Prefix caching is not supported on CPU, disable it.")
         config.enable_prefix_caching = False
 
-    kv_cache_space_str = os.getenv("APHRODITE_CPU_KVCACHE_SPACE", "0")
+    kv_cache_space_str = envs.APHRODITE_CPU_KVCACHE_SPACE
     kv_cache_space = int(kv_cache_space_str)
 
     if kv_cache_space >= 0:

+ 3 - 1
aphrodite/executor/multiproc_worker_utils.py

@@ -14,6 +14,8 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
 
 from loguru import logger
 
+from aphrodite import envs
+
 T = TypeVar('T')
 
 _TERMINATE = "TERMINATE"  # sentinel
@@ -26,7 +28,7 @@ JOIN_TIMEOUT_S = 2
 
 # Use dedicated multiprocess context for workers.
 # Both spawn and fork work
-mp_method = os.getenv("APHRODITE_WORKER_MULTIPROC_METHOD", "fork")
+mp_method = envs.APHRODITE_WORKER_MULTIPROC_METHOD
 mp = multiprocessing.get_context(mp_method)
 
 

+ 4 - 5
aphrodite/executor/openvino_executor.py

@@ -1,4 +1,3 @@
-import os
 from typing import List, Set, Tuple
 
 import openvino as ov
@@ -6,6 +5,7 @@ import openvino.properties.hint as hints
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import CacheConfig, ModelConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_distributed_init_method,
@@ -13,10 +13,9 @@ from aphrodite.common.utils import (GiB_bytes, get_distributed_init_method,
 from aphrodite.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from aphrodite.lora.request import LoRARequest
 
-APHRODITE_OPENVINO_KVCACHE_SPACE = int(
-    os.getenv("APHRODITE_OPENVINO_KVCACHE_SPACE", 0))
-APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION = os.getenv(
-    "APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION", None)
+APHRODITE_OPENVINO_KVCACHE_SPACE = envs.APHRODITE_OPENVINO_KVCACHE_SPACE
+APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION = (
+    envs.APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION)
 
 
 class OpenVINOExecutor(ExecutorBase):

+ 7 - 8
aphrodite/executor/ray_gpu_executor.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 import msgspec
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (_run_task_with_lock,
                                     get_aphrodite_instance_id,
@@ -26,14 +27,12 @@ if TYPE_CHECKING:
 # If the env var is set, it uses the Ray's compiled DAG API
 # which optimizes the control plane overhead.
 # Run Aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
-APHRODITE_USE_RAY_COMPILED_DAG = bool(
-    os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", 0))
-APHRODITE_TRACE_FUNCTION = int(os.getenv("APHRODITE_TRACE_FUNCTION", 0))
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
-
-APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = bool(
-    int(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", 1)))
+APHRODITE_USE_RAY_COMPILED_DAG = envs.APHRODITE_USE_RAY_COMPILED_DAG
+APHRODITE_TRACE_FUNCTION = envs.APHRODITE_TRACE_FUNCTION
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
+
+APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = (
+    envs.APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL)
 
 
 class RayGPUExecutor(DistributedGPUExecutor):

+ 2 - 1
aphrodite/executor/ray_tpu_executor.py

@@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple,
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (get_aphrodite_instance_id,
                                     get_distributed_init_method, get_ip,
@@ -21,7 +22,7 @@ if ray is not None:
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
 
-APHRODITE_TRACE_FUNCTION = int(os.getenv("APHRODITE_TRACE_FUNCTION", 0))
+APHRODITE_TRACE_FUNCTION = envs.APHRODITE_TRACE_FUNCTION
 
 
 class RayTPUExecutor(TPUExecutor):

+ 2 - 1
aphrodite/executor/ray_xpu_executor.py

@@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
                                      PromptAdapterConfig, SchedulerConfig,
@@ -28,7 +29,7 @@ if TYPE_CHECKING:
 # If the env var is set, it uses the Ray's compiled DAG API
 # which optimizes the control plane overhead.
 # Run Aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
-USE_RAY_COMPILED_DAG = bool(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", 0))
+USE_RAY_COMPILED_DAG = envs.APHRODITE_USE_RAY_COMPILED_DAG
 
 
 class RayXPUExecutor(DistributedGPUExecutor):

+ 2 - 2
aphrodite/modeling/layers/fused_moe/fused_moe.py

@@ -10,10 +10,10 @@ import triton.language as tl
 from loguru import logger
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.platforms import current_platform
 
-APHRODITE_FUSED_MOE_CHUNK_SIZE = int(
-    os.getenv("APHRODITE_FUSED_MOE_CHUNK_SIZE", "65536"))
+APHRODITE_FUSED_MOE_CHUNK_SIZE = envs.APHRODITE_FUSED_MOE_CHUNK_SIZE
 
 
 @triton.jit

+ 2 - 3
aphrodite/modeling/layers/sampler.py

@@ -1,6 +1,5 @@
 """A layer that samples the next tokens from the model's outputs."""
 import itertools
-import os
 import warnings
 from enum import IntEnum
 from math import inf
@@ -11,6 +10,7 @@ import torch.nn as nn
 from loguru import logger
 
 import aphrodite._custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.sampling_params import SamplingType
 from aphrodite.common.sequence import (CompletionSequenceGroupOutput, Logprob,
                                        PromptLogprobs, SampleLogprobs,
@@ -34,8 +34,7 @@ _TEMPERATURE_MINIMUM = 2e-5
 
 # If enabled, we switch to a more performant implementation
 # of top-k and top-p
-APHRODITE_USE_SAMPLING_KERNELS = bool(int(
-    os.getenv("APHRODITE_USE_SAMPLING_KERNELS", "0")))
+APHRODITE_USE_SAMPLING_KERNELS = envs.APHRODITE_USE_SAMPLING_KERNELS
 
 
 class SamplerID(IntEnum):

+ 3 - 3
aphrodite/modeling/model_loader/openvino.py

@@ -1,5 +1,4 @@
 # ruff: noqa: SIM117
-import os
 from pathlib import Path
 from typing import List, Optional, Tuple
 
@@ -11,6 +10,7 @@ from openvino._offline_transformations import paged_attention_transformation
 from optimum.intel import OVModelForCausalLM
 from torch import nn
 
+from aphrodite import envs
 from aphrodite.attention.backends.openvino import OpenVINOAttentionMetadata
 from aphrodite.common.config import DeviceConfig, ModelConfig
 from aphrodite.common.sequence import SamplerOutput
@@ -19,8 +19,8 @@ from aphrodite.modeling.layers.logits_processor import (LogitsProcessor,
 from aphrodite.modeling.layers.sampler import Sampler
 from aphrodite.modeling.sampling_metadata import SamplingMetadata
 
-APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS = bool(
-    os.getenv("APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False))
+APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS = (
+    envs.APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS)
 
 
 def _flattenize_inputs(inputs):

+ 4 - 3
aphrodite/modeling/model_loader/tensorizer.py

@@ -13,6 +13,7 @@ from loguru import logger
 from torch import nn
 from transformers import PretrainedConfig
 
+from aphrodite import envs
 from aphrodite.common.config import ModelConfig, ParallelConfig
 from aphrodite.engine.aphrodite_engine import AphroditeEngine
 from aphrodite.engine.args_tools import EngineArgs
@@ -148,12 +149,12 @@ class TensorizerArgs:
     def __post_init__(self):
         self.file_obj = self.tensorizer_uri
         self.s3_access_key_id = (self.s3_access_key_id
-                                 or os.environ.get("S3_ACCESS_KEY_ID")) or None
+                                 or envs.S3_ACCESS_KEY_ID) or None
         self.s3_secret_access_key = (
             self.s3_secret_access_key
-            or os.environ.get("S3_SECRET_ACCESS_KEY")) or None
+            or envs.S3_SECRET_ACCESS_KEY) or None
         self.s3_endpoint = (self.s3_endpoint
-                            or os.environ.get("S3_ENDPOINT_URL")) or None
+                            or envs.S3_ENDPOINT_URL) or None
         self.stream_params = {
             "s3_access_key_id": self.s3_access_key_id,
             "s3_secret_access_key": self.s3_secret_access_key,

+ 3 - 5
aphrodite/multimodal/utils.py

@@ -1,5 +1,4 @@
 import base64
-import os
 from io import BytesIO
 from typing import Tuple, Union
 
@@ -8,14 +7,13 @@ import numpy as np
 import soundfile
 from PIL import Image
 
+from aphrodite import envs
 from aphrodite.common.connections import global_http_connection
 from aphrodite.multimodal.base import MultiModalDataDict
 
-APHRODITE_IMAGE_FETCH_TIMEOUT = int(
-    os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT", 10))
+APHRODITE_IMAGE_FETCH_TIMEOUT = envs.APHRODITE_IMAGE_FETCH_TIMEOUT
 
-APHRODITE_AUDIO_FETCH_TIMEOUT = int(
-    os.getenv("APHRODITE_AUDIO_FETCH_TIMEOUT", 10))
+APHRODITE_AUDIO_FETCH_TIMEOUT = envs.APHRODITE_AUDIO_FETCH_TIMEOUT
 
 
 def _load_image_from_bytes(b: bytes):

+ 3 - 5
aphrodite/plugins/__init__.py

@@ -1,9 +1,7 @@
-import os
-
 from loguru import logger
 
-APHRODITE_PLUGINS = None if "APHRODITE_PLUGINS" not in os.environ else \
-    os.environ["APHRODITE_PLUGINS"].split(",")
+from aphrodite import envs
+
 
 def load_general_plugins():
     """WARNING: plugins can be loaded for multiple times in different
@@ -16,7 +14,7 @@ def load_general_plugins():
     else:
         from importlib.metadata import entry_points
 
-    allowed_plugins = APHRODITE_PLUGINS
+    allowed_plugins = envs.APHRODITE_PLUGINS
 
     discovered_plugins = entry_points(group='aphrodite.general_plugins')
     for plugin in discovered_plugins:

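Side note (not from the diff): the APHRODITE_PLUGINS value is a plain comma split, which is what makes the three states described in the envs.py comment behave as documented. A standalone restatement of that lambda:

import os

def parse_plugins():
    # Mirrors the lambda registered under "APHRODITE_PLUGINS" in envs.py.
    return (None if "APHRODITE_PLUGINS" not in os.environ
            else os.environ["APHRODITE_PLUGINS"].split(","))

# Unset      -> None            (allowed_plugins is None: load every plugin)
# "foo,bar"  -> ["foo", "bar"]  (only entry points named foo or bar load)
# ""         -> [""]            (matches no entry-point name, nothing loads)
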
+ 2 - 3
aphrodite/quantization/fp8.py

@@ -1,4 +1,3 @@
-import os
 from typing import Any, Dict, List, Optional
 
 import torch
@@ -7,6 +6,7 @@ from torch.nn import Module
 from torch.nn.parameter import Parameter
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.utils import is_hip, print_warning_once
 from aphrodite.modeling.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from aphrodite.modeling.layers.linear import (LinearBase, LinearMethodBase,
@@ -26,8 +26,7 @@ from aphrodite.quantization.utils.w8a8_utils import (
     requantize_with_max_scale)
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
-APHRODITE_TEST_FORCE_FP8_MARLIN = os.environ.get(
-    "APHRODITE_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in ("1", "true")
+APHRODITE_TEST_FORCE_FP8_MARLIN = envs.APHRODITE_TEST_FORCE_FP8_MARLIN
 
 
 class Fp8Config(QuantizationConfig):

+ 3 - 3
aphrodite/server/launch.py

@@ -1,5 +1,4 @@
 import asyncio
-import os
 import signal
 from http import HTTPStatus
 from typing import Any
@@ -8,12 +7,13 @@ import uvicorn
 from fastapi import FastAPI, Response
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.utils import find_process_using_port, in_windows
 from aphrodite.engine.async_aphrodite import AsyncEngineDeadError
 from aphrodite.engine.protocol import AsyncEngineClient
 
-APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH = bool(os.getenv(
-    "APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH", 0))
+APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH = (
+    envs.APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH)
 
 
 async def serve_http(app: FastAPI, engine: AsyncEngineClient,

+ 2 - 3
aphrodite/task_handler/cpu_worker.py

@@ -1,10 +1,10 @@
 """A CPU worker class."""
-import os
 from typing import Dict, List, Optional, Tuple
 
 import torch
 import torch.distributed
 
+from aphrodite import envs
 from aphrodite.attention import get_attn_backend
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
@@ -19,8 +19,7 @@ from aphrodite.task_handler.worker_base import (LocalOrDistributedWorkerBase,
                                                 LoraNotSupportedWorkerBase,
                                                 WorkerInput)
 
-APHRODITE_CPU_OMP_THREADS_BIND = os.getenv("APHRODITE_CPU_OMP_THREADS_BIND",
-                                           "all")
+APHRODITE_CPU_OMP_THREADS_BIND = envs.APHRODITE_CPU_OMP_THREADS_BIND
 
 
 class CPUCacheEngine:

+ 2 - 2
aphrodite/task_handler/tpu_worker.py

@@ -5,6 +5,7 @@ import torch
 import torch_xla.core.xla_model as xm
 import torch_xla.runtime as xr
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)
@@ -99,8 +100,7 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         # Use persistent cache to avoid XLA recompilation.
         # NOTE: Set per-rank cache path since different ranks
         # can have slightly different XLA graphs.
-        APHRODITE_XLA_CACHE_PATH = os.getenv("APHRODITE_XLA_CACHE_PATH",
-                                             "~/.aphrodite/xla_cache/")
+        APHRODITE_XLA_CACHE_PATH = envs.APHRODITE_XLA_CACHE_PATH
         world_size = self.parallel_config.world_size
         per_rank_path = os.path.join(APHRODITE_XLA_CACHE_PATH,
                                      f"tp{world_size}_rank{self.rank}")

+ 2 - 2
aphrodite/transformers_utils/config.py

@@ -1,7 +1,6 @@
 import contextlib
 import enum
 import json
-import os
 from pathlib import Path
 from typing import Any, Dict, Optional, Type, Union
 
@@ -14,6 +13,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
+from aphrodite import envs
 from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   InternVLChatConfig,
                                                   JAISConfig, MedusaConfig,
@@ -21,7 +21,7 @@ from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   MPTConfig, RWConfig)
 from aphrodite.transformers_utils.utils import check_gguf_file
 
-APHRODITE_USE_MODELSCOPE = os.getenv("APHRODITE_USE_MODELSCOPE", "0") == "1"
+APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE
 
 if APHRODITE_USE_MODELSCOPE:
     from modelscope import AutoConfig

+ 4 - 5
examples/tensorize_aphrodite_model.py

@@ -1,10 +1,9 @@
 import argparse
 import dataclasses
 import json
-import os
 import uuid
 
-from aphrodite import LLM
+from aphrodite import LLM, envs
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.modeling.model_loader.tensorizer import (
     TensorizerArgs, TensorizerConfig, tensorize_aphrodite_model)
@@ -177,11 +176,11 @@ if __name__ == '__main__':
     args = parse_args()
 
     s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-                        or os.environ.get("S3_ACCESS_KEY_ID", None))
+                        or envs.S3_ACCESS_KEY_ID)
     s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
+                            or envs.S3_SECRET_ACCESS_KEY)
     s3_endpoint = (getattr(args, 's3_endpoint', None)
-                or os.environ.get("S3_ENDPOINT_URL", None))
+                or envs.S3_ENDPOINT_URL)
 
     credentials = {
         "s3_access_key_id": s3_access_key_id,

+ 22 - 7
setup.py

@@ -1,3 +1,4 @@
+import importlib.util
 import io
 import logging
 import os
@@ -14,10 +15,16 @@ from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 from torch.utils.cpp_extension import CUDA_HOME
 
+
+def load_module_from_path(module_name, path):
+    spec = importlib.util.spec_from_file_location(module_name, path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
-# Target device of Aphrodite, supporting [cuda (by default), rocm, neuron, cpu]
-APHRODITE_TARGET_DEVICE = os.getenv("APHRODITE_TARGET_DEVICE", "cuda")
 
 
 def embed_commit_hash():
@@ -47,6 +54,14 @@ def embed_commit_hash():
 
 embed_commit_hash()
 
+
+# cannot import envs directly because it depends on aphrodite,
+#  which is not installed yet
+envs = load_module_from_path('envs', os.path.join(
+    ROOT_DIR, 'aphrodite', 'envs.py'))
+
+APHRODITE_TARGET_DEVICE = envs.APHRODITE_TARGET_DEVICE
+
 if not sys.platform.startswith("linux"):
     logger.warning(
         "Aphrodite only supports Linux platform (including WSL). "
@@ -97,7 +112,7 @@ class cmake_build_ext(build_ext):
     def compute_num_jobs(self):
         # `num_jobs` is either the value of the MAX_JOBS environment variable
         # (if defined) or the number of CPUs available.
-        num_jobs = os.environ.get("MAX_JOBS", None)
+        num_jobs = envs.MAX_JOBS
         if num_jobs is not None:
             num_jobs = int(num_jobs)
             logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")
@@ -118,7 +133,7 @@ class cmake_build_ext(build_ext):
             # environment variable (if defined) or 1.
             # when it is set, we reduce `num_jobs` to avoid
             # overloading the system.
-            nvcc_threads = os.getenv("NVCC_THREADS", None)
+            nvcc_threads = envs.NVCC_THREADS
             if nvcc_threads is not None:
                 nvcc_threads = int(nvcc_threads)
                 logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number"
@@ -143,7 +158,7 @@ class cmake_build_ext(build_ext):
         # Select the build type.
         # Note: optimization level + debug info are set by the build type
         default_cfg = "Debug" if self.debug else "RelWithDebInfo"
-        cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
+        cfg = envs.CMAKE_BUILD_TYPE or default_cfg
 
         # where .so files will be written, should be the same for all extensions
         # that use the same CMakeLists.txt.
@@ -161,7 +176,7 @@ class cmake_build_ext(build_ext):
             '-DAPHRODITE_TARGET_DEVICE={}'.format(APHRODITE_TARGET_DEVICE),
         ]
 
-        verbose = bool(int(os.getenv('VERBOSE', '0')))
+        verbose = envs.VERBOSE
         if verbose:
             cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
 
@@ -469,7 +484,7 @@ package_data = {
         "py.typed", "modeling/layers/fused_moe/configs/*.json"
     ]
 }
-if os.environ.get("APHRODITE_USE_PRECOMPILED"):
+if envs.APHRODITE_USE_PRECOMPILED:
     ext_modules = []
     package_data["aphrodite"].append("*.so")
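
For completeness, and not part of the commit: because aphrodite/envs.py depends only on the standard library, the bootstrap that setup.py performs above can be reproduced in isolation. The sketch below simply restates load_module_from_path as a runnable script run from the repository root.

# Load envs.py straight from its file path, bypassing the aphrodite package
# (which is not importable at build time). Mirrors the setup.py bootstrap.
import importlib.util
import os
import sys

def load_module_from_path(module_name, path):
    spec = importlib.util.spec_from_file_location(module_name, path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module

ROOT_DIR = os.path.dirname(__file__)
envs = load_module_from_path(
    "envs", os.path.join(ROOT_DIR, "aphrodite", "envs.py"))
print(envs.APHRODITE_TARGET_DEVICE)   # "cuda" unless overridden in the env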