
chore: consolidate environment variables within one file (#882)

AlpinDale, 3 months ago
Parent commit: 901900854e
36 changed files with 543 additions and 132 deletions
  1. +3 -7      aphrodite/assets/base.py
  2. +2 -4      aphrodite/attention/backends/rocm_flash_attn.py
  3. +2 -1      aphrodite/attention/selector.py
  4. +34 -16    aphrodite/common/config.py
  5. +3 -2      aphrodite/common/logger.py
  6. +3 -3      aphrodite/common/sampling_params.py
  7. +8 -11     aphrodite/common/utils.py
  8. +2 -2      aphrodite/distributed/device_communicators/custom_all_reduce.py
  9. +7 -6      aphrodite/distributed/device_communicators/custom_all_reduce_utils.py
 10. +3 -3      aphrodite/distributed/device_communicators/shm_broadcast.py
 11. +3 -2      aphrodite/distributed/parallel_state.py
 12. +3 -2      aphrodite/distributed/utils.py
 13. +2 -1      aphrodite/endpoints/openai/api_server.py
 14. +2 -3      aphrodite/engine/aphrodite_engine.py
 15. +2 -3      aphrodite/engine/args_tools.py
 16. +2 -3      aphrodite/engine/async_aphrodite.py
 17. +388 -0    aphrodite/envs.py
 18. +2 -1      aphrodite/executor/cpu_executor.py
 19. +3 -1      aphrodite/executor/multiproc_worker_utils.py
 20. +4 -5      aphrodite/executor/openvino_executor.py
 21. +7 -8      aphrodite/executor/ray_gpu_executor.py
 22. +2 -1      aphrodite/executor/ray_tpu_executor.py
 23. +2 -1      aphrodite/executor/ray_xpu_executor.py
 24. +2 -2      aphrodite/modeling/layers/fused_moe/fused_moe.py
 25. +2 -3      aphrodite/modeling/layers/sampler.py
 26. +3 -3      aphrodite/modeling/model_loader/openvino.py
 27. +4 -3      aphrodite/modeling/model_loader/tensorizer.py
 28. +3 -5      aphrodite/multimodal/utils.py
 29. +3 -5      aphrodite/plugins/__init__.py
 30. +2 -3      aphrodite/quantization/fp8.py
 31. +3 -3      aphrodite/server/launch.py
 32. +2 -3      aphrodite/task_handler/cpu_worker.py
 33. +2 -2      aphrodite/task_handler/tpu_worker.py
 34. +2 -2      aphrodite/transformers_utils/config.py
 35. +4 -5      examples/tensorize_aphrodite_model.py
 36. +22 -7     setup.py

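Note on the pattern: across the files listed above, scattered os.getenv() calls with ad-hoc parsing are replaced by attributes on the new aphrodite.envs module, so each variable is declared, documented, and parsed in exactly one place. This also retires a footgun visible in the old code below (e.g. in ray_gpu_executor.py), where bool(os.getenv(NAME, 0)) treats any non-empty string, including "0", as True. A minimal sketch of the before/after usage, with a variable defined in this commit:

    # Before: inline lookup, easy to parse inconsistently.
    import os
    use_spmd = bool(os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))  # "0" -> True!

    # After: single definition in aphrodite/envs.py, parsed as bool(int(...)).
    from aphrodite import envs
    use_spmd = envs.APHRODITE_USE_RAY_SPMD_WORKER  # "0" -> False
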
+ 3 - 7
aphrodite/assets/base.py

@@ -5,6 +5,7 @@ from functools import lru_cache
 from pathlib import Path
 from typing import Optional
 
+from aphrodite import envs
 from aphrodite.connections import global_http_connection
 
 
@@ -15,13 +16,8 @@ def get_default_cache_root():
     )
 
 vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
-APHRODITE_ASSETS_CACHE = os.path.expanduser(
-    os.getenv(
-        "APHRODITE_ASSETS_CACHE",
-        os.path.join(get_default_cache_root(), "aphrodite", "assets"),
-    ))
-APHRODITE_IMAGE_FETCH_TIMEOUT = int(os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT",
-                                              5))
+APHRODITE_ASSETS_CACHE = envs.APHRODITE_ASSETS_CACHE
+APHRODITE_IMAGE_FETCH_TIMEOUT = envs.APHRODITE_IMAGE_FETCH_TIMEOUT
 
 
 def get_cache_dir() -> Path:
     """Get the path to the cache for storing downloaded assets."""

+ 2 - 4
aphrodite/attention/backends/rocm_flash_attn.py

@@ -1,11 +1,11 @@
 """Attention layer ROCm GPUs."""
 """Attention layer ROCm GPUs."""
-import os
 from dataclasses import dataclass
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
 from typing import Any, Dict, List, Optional, Tuple, Type
 
 
 import torch
 import torch
 from loguru import logger
 from loguru import logger
 
 
+from aphrodite import envs
 from aphrodite.attention.backends.abstract import (AttentionBackend,
 from aphrodite.attention.backends.abstract import (AttentionBackend,
                                                    AttentionImpl,
                                                    AttentionImpl,
                                                    AttentionMetadata,
                                                    AttentionMetadata,
@@ -280,9 +280,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
 
 
         self.use_naive_attn = False
         self.use_naive_attn = False
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
-        self.use_triton_flash_attn = (os.environ.get(
-            "APHRODITE_USE_TRITON_FLASH_ATTN", "True").lower()
-                                      in ("true", "1"))
+        self.use_triton_flash_attn = envs.APHRODITE_USE_TRITON_FLASH_ATTN
         if self.use_triton_flash_attn:
         if self.use_triton_flash_attn:
             from aphrodite.attention.ops.triton_flash_attn import (  # noqa: F401
             from aphrodite.attention.ops.triton_flash_attn import (  # noqa: F401
                 triton_attention)
                 triton_attention)

+ 2 - 1
aphrodite/attention/selector.py

@@ -7,12 +7,13 @@ from typing import Generator, Optional, Type
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.attention.backends.abstract import AttentionBackend
 from aphrodite.common.utils import (STR_BACKEND_ENV_VAR, is_cpu, is_hip,
                                     is_openvino, is_xpu)
 from aphrodite.platforms import current_platform
 
-APHRODITE_ATTENTION_BACKEND = os.getenv("APHRODITE_ATTENTION_BACKEND", None)
+APHRODITE_ATTENTION_BACKEND = envs.APHRODITE_ATTENTION_BACKEND
 
 
 class _Backend(enum.Enum):

+ 34 - 16
aphrodite/common/config.py

@@ -9,6 +9,7 @@ import torch
 from loguru import logger
 from transformers import PretrainedConfig
 
+from aphrodite import envs
 from aphrodite.common.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
                                     cuda_device_count_stateless,
                                     get_cpu_memory, is_cpu, is_hip, is_neuron,
@@ -30,8 +31,7 @@ if TYPE_CHECKING:
         BaseTokenizerGroup)
 
 # If true, will load models from ModelScope instead of Hugging Face Hub.
-APHRODITE_USE_MODELSCOPE = os.environ.get("APHRODITE_USE_MODELSCOPE",
-                                          "False").lower() == "true"
+APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE
 
 _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 
@@ -1820,21 +1820,39 @@ def _get_and_verify_max_len(
                     "original_max_position_embeddings"]
             derived_max_model_len *= scaling_factor
 
+    # If the user specified a max length, make sure it is smaller than the
+    # derived length from the HF model config.
     if max_model_len is None:
-        max_model_len = derived_max_model_len
-    elif max_model_len > derived_max_model_len and rope_scaling_arg is None:
-        raise ValueError(
-            f"User-specified max_model_len {max_model_len} is higher than "
-            f"the original {derived_max_model_len}. "
-            "Please provide a rope_scaling dict to scale the model.")
-    elif max_model_len > derived_max_model_len and rope_scaling_arg is not None:
-        # hope this works
-        logger.warning(
-            f"User-specified max_model_len {max_model_len} is higher than "
-            f"the original {derived_max_model_len}. "
-            "Attempting to use RoPE scaling with the provided rope_scaling "
-            "dict.")
-        derived_max_model_len = max_model_len
+        max_model_len = int(derived_max_model_len)
+    elif max_model_len > derived_max_model_len:
+        # Some models might have a separate key for specifying model_max_length
+        # that will be bigger than derived_max_model_len. We compare user input
+        # with model_max_length and allow this override when it's smaller.
+        model_max_length = getattr(hf_config, "model_max_length", None)
+        if envs.APHRODITE_DYNAMIC_ROPE_SCALING:
+            scaling_factor = max_model_len / derived_max_model_len
+            hf_config.rope_scaling = {"factor": scaling_factor,
+                                      "type": "dynamic"}
+            logger.info(
+                "Using dynamic RoPE scaling to extend the model's max context "
+                f"length from {derived_max_model_len} to {max_model_len}.")
+            derived_max_model_len = max_model_len
+        elif model_max_length is not None and max_model_len <= model_max_length:
+            if disable_sliding_window:
+                # TODO: Find a model that has model_max_length
+                # with sliding window to see if this case should be allowed.
+                raise NotImplementedError(
+                    "Disabling sliding window is not supported for models "
+                    "with model_max_length in the config. Please raise an "
+                    "issue so we can investigate.")
+        else:
+            raise ValueError(
+                f"User-specified max_model_len ({max_model_len}) is greater "
+                f"than the derived max_model_len ({max_len_key}="
+                f"{derived_max_model_len} or model_max_length="
+                f"{model_max_length} in model's config.json). To allow "
+                "greater lengths, please set the env var "
+                "APHRODITE_DYNAMIC_ROPE_SCALING=1")
 
     return int(max_model_len)

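The new envs.APHRODITE_DYNAMIC_ROPE_SCALING branch above replaces the old rope_scaling_arg path: a context length above the value derived from config.json is now requested via the env var instead of a rope_scaling dict. A sketch of the intended use (model name illustrative; the LLM entry point is assumed to mirror the usual aphrodite API):

    import os

    # Opt in before the engine evaluates its env vars.
    os.environ["APHRODITE_DYNAMIC_ROPE_SCALING"] = "1"

    from aphrodite import LLM

    # With a derived max length of 4096, requesting 8192 sets
    # hf_config.rope_scaling = {"type": "dynamic", "factor": 2.0}
    # instead of raising ValueError.
    llm = LLM(model="meta-llama/Llama-2-7b-hf", max_model_len=8192)
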
+ 3 - 2
aphrodite/common/logger.py

@@ -15,11 +15,12 @@ from rich.markup import escape
 from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            TaskProgressColumn, TextColumn, TimeRemainingColumn)
 
+from aphrodite import envs
+
 RICH_CONSOLE = Console()
 LOG_LEVEL = os.getenv("APHRODITE_LOG_LEVEL", "INFO").upper()
 
-APHRODITE_CONFIGURE_LOGGING = int(os.getenv("APHRODITE_CONFIGURE_LOGGING",
-                                            "1"))
+APHRODITE_CONFIGURE_LOGGING = envs.APHRODITE_CONFIGURE_LOGGING
 
 
 def unwrap(wrapped, default=None):

+ 3 - 3
aphrodite/common/sampling_params.py

@@ -1,6 +1,5 @@
 """Sampling parameters for text generation."""
 """Sampling parameters for text generation."""
 import copy
 import copy
-import os
 from enum import IntEnum
 from enum import IntEnum
 from functools import cached_property
 from functools import cached_property
 from typing import Any, Callable, Dict, List, Optional, Set, Union
 from typing import Any, Callable, Dict, List, Optional, Set, Union
@@ -10,11 +9,12 @@ import torch
 from loguru import logger
 from loguru import logger
 from typing_extensions import Annotated
 from typing_extensions import Annotated
 
 
+from aphrodite import envs
+
 _SAMPLING_EPS = 1e-5
 _SAMPLING_EPS = 1e-5
 _MAX_TEMP = 1e-2
 _MAX_TEMP = 1e-2
 
 
-APHRODITE_NO_DEPRECATION_WARNING = bool(
-    int(os.environ.get("APHRODITE_NO_DEPRECATION_WARNING", "0")))
+APHRODITE_NO_DEPRECATION_WARNING = envs.APHRODITE_NO_DEPRECATION_WARNING
 
 
 
 
 class SamplingType(IntEnum):
 class SamplingType(IntEnum):

+ 8 - 11
aphrodite/common/utils.py

@@ -31,6 +31,7 @@ from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            SpinnerColumn, TextColumn, TimeElapsedColumn)
 from typing_extensions import ParamSpec, TypeIs, assert_never
 
+from aphrodite import envs
 from aphrodite.common.logger import enable_trace_function_call
 from aphrodite.distributed import get_tensor_model_parallel_rank
 
@@ -382,8 +383,7 @@ def get_aphrodite_instance_id():
     Instance id represents an instance of the Aphrodite. All processes in the
     same instance should have the same instance id.
     """
-    return os.environ.get("APHRODITE_INSTANCE_ID",
-                          f"aphrodite-instance-{random_uuid()}")
+    return envs.APHRODITE_INSTANCE_ID or f"aphrodite-instance-{random_uuid()}"
 
 
 @lru_cache(maxsize=None)
@@ -520,9 +520,7 @@ def get_distributed_init_method(ip: str, port: int) -> str:
 
 def get_open_zmq_ipc_path() -> str:
     if not in_windows():
-        APHRODITE_RPC_BASE_PATH = os.getenv("APHRODITE_RPC_BASE_PATH",
-                                        tempfile.gettempdir())
-        base_rpc_path = APHRODITE_RPC_BASE_PATH
+        base_rpc_path = envs.APHRODITE_RPC_BASE_PATH
         return f"ipc://{base_rpc_path}/{uuid4()}"
     else:
         # windows doesn't support ipc://
@@ -530,8 +528,7 @@ def get_open_zmq_ipc_path() -> str:
         return f"tcp://127.0.0.1:{get_open_port()}"
 
 def get_open_port(port: Optional[int] = None) -> int:
-    port = int(os.getenv("APHRODITE_PORT", 0)
-                ) if "APHRODITE_PORT" in os.environ else None
+    port = envs.APHRODITE_PORT
     if port is not None:
         while True:
             try:
@@ -948,7 +945,7 @@ def find_library(lib_name: str) -> str:
     # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
     locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
     # `LD_LIBRARY_PATH` searches the library in the user-defined paths
-    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    env_ld_library_path = envs.LD_LIBRARY_PATH
     if not locs and env_ld_library_path:
         locs = [
             os.path.join(dir, lib_name)
@@ -967,7 +964,7 @@ def find_nccl_library() -> str:
     After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
     found by `ctypes` automatically.
     """
-    so_file = os.environ.get("APHRODITE_NCCL_SO_PATH", "")
+    so_file = envs.APHRODITE_NCCL_SO_PATH
 
     # manually load the nccl library
     if so_file:
@@ -985,7 +982,7 @@ def find_nccl_library() -> str:
 
 
 def enable_trace_function_call_for_thread() -> None:
-    if int(os.getenv("APHRODITE_TRACE_FUNCTION", "0")):
+    if envs.APHRODITE_TRACE_FUNCTION:
         tmp_dir = tempfile.gettempdir()
         filename = (f"APHRODITE_TRACE_FUNCTION_for_process_{os.getpid()}"
                     f"_thread_{threading.get_ident()}_"
@@ -1074,7 +1071,7 @@ def cuda_device_count_stateless() -> int:
     # This can be removed and simply replaced with torch.cuda.get_device_count
     # after https://github.com/pytorch/pytorch/pull/122815 is released.
 
-    return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES"))
+    return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
 #From: https://stackoverflow.com/a/4104188/2749989

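The get_open_port() change above keeps the old semantics through the envs module: envs.APHRODITE_PORT is None when the variable is unset, so the OS still picks a free ephemeral port by default. A small sketch of the expected behavior:

    import os

    os.environ["APHRODITE_PORT"] = "2242"

    from aphrodite import envs
    from aphrodite.common.utils import get_open_port

    # envs.APHRODITE_PORT is parsed lazily on attribute access, so the
    # assignment above is visible here: 2242 (int), or None when unset.
    print(envs.APHRODITE_PORT)
    print(get_open_port())  # starts probing from 2242
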
+ 2 - 2
aphrodite/distributed/device_communicators/custom_all_reduce.py

@@ -1,4 +1,3 @@
-import os
 from contextlib import contextmanager
 from typing import Any, List, Optional, Union
 
@@ -8,6 +7,7 @@ from loguru import logger
 from torch.distributed import ProcessGroup
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.utils import cuda_device_count_stateless
 from aphrodite.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
@@ -95,7 +95,7 @@ class CustomAllreduce:
         assert isinstance(device, torch.device)
         self.device = device
 
-        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:

+ 7 - 6
aphrodite/distributed/device_communicators/custom_all_reduce_utils.py

@@ -11,6 +11,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.utils import (cuda_device_count_stateless,
                                     update_environment_variables)
 from aphrodite.distributed.device_communicators.cuda_wrapper import (
@@ -124,7 +125,7 @@ def can_actually_p2p(
     processes for testing all pairs of GPUs in batch. The trick is to reset
     the device after each test (which is not available in PyTorch).
     """  # noqa
-    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     # pass the CUDA_VISIBLE_DEVICES to the child process
     # to make sure they see the same set of GPUs
 
@@ -183,13 +184,13 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     is_distributed = dist.is_initialized()
 
     num_dev = cuda_device_count_stateless()
-    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
-    APHRODITE_CONFIG_ROOT = os.getenv("APHRODITE_CONFIG_ROOT", "~/.config")
-    path = os.path.expanduser(
-        f"{APHRODITE_CONFIG_ROOT}/aphrodite/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
-    )
+
+    path = os.path.join(
+        envs.APHRODITE_CACHE_ROOT,
+        f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
     os.makedirs(os.path.dirname(path), exist_ok=True)
     from aphrodite.distributed.parallel_state import get_world_group
     if ((not is_distributed or get_world_group().local_rank == 0)

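Behavioral note: the P2P access cache file moves from the config root to the new envs.APHRODITE_CACHE_ROOT (default ~/.cache/aphrodite, honoring XDG_CACHE_HOME). A sketch of where the cache now lands:

    import os
    from aphrodite import envs

    # e.g. /home/user/.cache/aphrodite/gpu_p2p_access_cache_for_0,1.json
    path = os.path.join(envs.APHRODITE_CACHE_ROOT,
                        "gpu_p2p_access_cache_for_0,1.json")
    print(path)
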
+ 3 - 3
aphrodite/distributed/device_communicators/shm_broadcast.py

@@ -1,4 +1,3 @@
-import os
 import pickle
 import time
 from contextlib import contextmanager
@@ -13,10 +12,11 @@ from loguru import logger
 from torch.distributed import ProcessGroup
 from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
 
+from aphrodite import envs
 from aphrodite.common.utils import get_ip, get_open_port
 
-APHRODITE_RINGBUFFER_WARNING_INTERVAL = os.getenv(
-    "APHRODITE_RINGBUFFER_WARNING_INTERVAL", 60)
+APHRODITE_RINGBUFFER_WARNING_INTERVAL = (
+    envs.APHRODITE_RINGBUFFER_WARNING_INTERVAL)
 
 # time to wait if the queue is full or empty
 # if we sleep for too short, it will consume too much CPU

+ 3 - 2
aphrodite/distributed/parallel_state.py

@@ -21,7 +21,6 @@ If you only need to use the distributed environment without model/pipeline
  steps.
 """
 import contextlib
-import os
 import pickle
 import sys
 from collections import namedtuple
@@ -36,6 +35,8 @@ import torch.distributed
 from loguru import logger
 from torch.distributed import Backend, ProcessGroup
 
+from aphrodite import envs
+
 
 @dataclass
 class GraphCaptureContext:
@@ -866,7 +867,7 @@ def init_distributed_environment(
         # local rank not set, this usually happens in single-node
         # setting, where we can use rank as local rank
         if distributed_init_method == "env://":
-            local_rank = os.getenv("LOCAL_RANK", rank)
+            local_rank = envs.LOCAL_RANK
         else:
             local_rank = rank
     global _WORLD

+ 3 - 2
aphrodite/distributed/utils.py

@@ -3,12 +3,13 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-import os
 from typing import Sequence, Tuple
 
 import torch
 
-APHRODITE_PP_LAYER_PARTITION = os.getenv("APHRODITE_PP_LAYER_PARTITION", None)
+from aphrodite import envs
+
+APHRODITE_PP_LAYER_PARTITION = envs.APHRODITE_PP_LAYER_PARTITION
 
 
 def ensure_divisibility(numerator, denominator):

+ 2 - 1
aphrodite/endpoints/openai/api_server.py

@@ -20,6 +20,7 @@ from fastapi.responses import (HTMLResponse, JSONResponse, Response,
 from loguru import logger
 from starlette.routing import Mount
 
+from aphrodite import envs
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sampling_params import _SAMPLING_EPS, SamplingParams
@@ -635,7 +636,7 @@ def build_app(args: Namespace) -> FastAPI:
         return JSONResponse(err.model_dump(),
                             status_code=HTTPStatus.BAD_REQUEST)
 
-    if token := os.environ.get("APHRODITE_API_KEY") or args.api_keys:
+    if token := envs.APHRODITE_API_KEY or args.api_keys:
         admin_key = os.environ.get("APHRODITE_ADMIN_KEY") or args.admin_key
 
         if admin_key is None:

+ 2 - 3
aphrodite/engine/aphrodite_engine.py

@@ -1,4 +1,3 @@
-import os
 import time
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, ClassVar, Dict, Iterable, List, Optional
@@ -9,6 +8,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DecodingConfig, DeviceConfig,
                                      EngineConfig, LoadConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,
@@ -50,8 +50,7 @@ from aphrodite.version import __version__ as APHRODITE_VERSION
 
 _LOCAL_LOGGING_INTERVAL_SEC = 5
 
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
 
 
 def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:

+ 2 - 3
aphrodite/engine/args_tools.py

@@ -1,13 +1,13 @@
 import argparse
 import dataclasses
 import json
-import os
 from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,
                     Union)
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, ConfigFormat, DecodingConfig,
                                      DeviceConfig, EngineConfig, LoadConfig,
                                      LoadFormat, LoRAConfig, ModelConfig,
@@ -24,8 +24,7 @@ if TYPE_CHECKING:
     from aphrodite.transformers_utils.tokenizer_group import BaseTokenizerGroup
 
 
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
 
 def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
     if len(val) == 0:

+ 2 - 3
aphrodite/engine/async_aphrodite.py

@@ -1,5 +1,4 @@
 import asyncio
-import os
 import time
 from dataclasses import dataclass
 from functools import partial
@@ -11,6 +10,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
+from aphrodite import envs
 from aphrodite.common.config import (DecodingConfig, EngineConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)
@@ -34,8 +34,7 @@ from aphrodite.lora.request import LoRARequest
 from aphrodite.processing.scheduler import SchedulerOutputs
 from aphrodite.prompt_adapter.request import PromptAdapterRequest
 
-ENGINE_ITERATION_TIMEOUT_S = int(
-    os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", "60"))
+ENGINE_ITERATION_TIMEOUT_S = envs.APHRODITE_ENGINE_ITERATION_TIMEOUT_S
 
 
 class AsyncEngineDeadError(RuntimeError):

+ 388 - 0
aphrodite/envs.py

@@ -0,0 +1,388 @@
+import os
+import tempfile
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+
+if TYPE_CHECKING:
+    APHRODITE_HOST_IP: str = ""
+    APHRODITE_PORT: Optional[int] = None
+    APHRODITE_RPC_BASE_PATH: str = tempfile.gettempdir()
+    APHRODITE_USE_MODELSCOPE: bool = False
+    APHRODITE_RINGBUFFER_WARNING_INTERVAL: int = 60
+    APHRODITE_INSTANCE_ID: Optional[str] = None
+    APHRODITE_NCCL_SO_PATH: Optional[str] = None
+    LD_LIBRARY_PATH: Optional[str] = None
+    APHRODITE_USE_TRITON_FLASH_ATTN: bool = False
+    LOCAL_RANK: int = 0
+    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    APHRODITE_ENGINE_ITERATION_TIMEOUT_S: int = 60
+    APHRODITE_API_KEY: Optional[str] = None
+    S3_ACCESS_KEY_ID: Optional[str] = None
+    S3_SECRET_ACCESS_KEY: Optional[str] = None
+    S3_ENDPOINT_URL: Optional[str] = None
+    APHRODITE_CACHE_ROOT: str = os.path.expanduser("~/.cache/aphrodite")
+    APHRODITE_CONFIG_ROOT: str = os.path.expanduser("~/.config/aphrodite")
+    APHRODITE_CONFIGURE_LOGGING: int = 1
+    APHRODITE_LOGGING_LEVEL: str = "INFO"
+    APHRODITE_LOGGING_CONFIG_PATH: Optional[str] = None
+    APHRODITE_TRACE_FUNCTION: int = 0
+    APHRODITE_ATTENTION_BACKEND: Optional[str] = None
+    APHRODITE_USE_SAMPLING_KERNELS: bool = False
+    APHRODITE_PP_LAYER_PARTITION: Optional[str] = None
+    APHRODITE_CPU_KVCACHE_SPACE: int = 0
+    APHRODITE_CPU_OMP_THREADS_BIND: str = ""
+    APHRODITE_OPENVINO_KVCACHE_SPACE: int = 0
+    APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
+    APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
+    APHRODITE_XLA_CACHE_PATH: str = os.path.join(APHRODITE_CACHE_ROOT, "xla_cache")  # noqa: E501
+    APHRODITE_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
+    APHRODITE_USE_RAY_SPMD_WORKER: bool = False
+    APHRODITE_USE_RAY_COMPILED_DAG: bool = False
+    APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
+    APHRODITE_WORKER_MULTIPROC_METHOD: str = "fork"
+    APHRODITE_ASSETS_CACHE: str = os.path.join(APHRODITE_CACHE_ROOT, "assets")
+    APHRODITE_IMAGE_FETCH_TIMEOUT: int = 5
+    APHRODITE_AUDIO_FETCH_TIMEOUT: int = 5
+    APHRODITE_TARGET_DEVICE: str = "cuda"
+    MAX_JOBS: Optional[str] = None
+    NVCC_THREADS: Optional[str] = None
+    APHRODITE_USE_PRECOMPILED: bool = False
+    APHRODITE_NO_DEPRECATION_WARNING: bool = False
+    APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
+    CMAKE_BUILD_TYPE: Optional[str] = None
+    VERBOSE: bool = False
+    APHRODITE_DYNAMIC_ROPE_SCALING: bool = False
+    APHRODITE_TEST_FORCE_FP8_MARLIN: bool = False
+    APHRODITE_ALLOW_ENGINE_USE_RAY: bool = False
+    APHRODITE_PLUGINS: Optional[List[str]] = None
+
+
+def get_default_cache_root():
+    return os.getenv(
+        "XDG_CACHE_HOME",
+        os.path.join(os.path.expanduser("~"), ".cache"),
+    )
+
+
+def get_default_config_root():
+    return os.getenv(
+        "XDG_CONFIG_HOME",
+        os.path.join(os.path.expanduser("~"), ".config"),
+    )
+
+
+# The begin-* and end-* here are used by the documentation generator
+# to extract the used env vars.
+
+# begin-env-vars-definition
+
+environment_variables: Dict[str, Callable[[], Any]] = {
+
+    # ================== Installation Time Env Vars ==================
+
+    # Target device of Aphrodite, supporting [cuda (by default),
+    # rocm, neuron, cpu, openvino]
+    "APHRODITE_TARGET_DEVICE":
+    lambda: os.getenv("APHRODITE_TARGET_DEVICE", "cuda"),
+
+    # Maximum number of compilation jobs to run in parallel.
+    # By default this is the number of CPUs
+    "MAX_JOBS":
+    lambda: os.getenv("MAX_JOBS", None),
+
+    # Number of threads to use for nvcc
+    # By default this is 1.
+    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
+    "NVCC_THREADS":
+    lambda: os.getenv("NVCC_THREADS", None),
+
+    # If set, Aphrodite will use precompiled binaries (*.so)
+    "APHRODITE_USE_PRECOMPILED":
+    lambda: bool(os.environ.get("APHRODITE_USE_PRECOMPILED")),
+
+    # CMake build type
+    # If not set, defaults to "Debug" or "RelWithDebInfo"
+    # Available options: "Debug", "Release", "RelWithDebInfo"
+    "CMAKE_BUILD_TYPE":
+    lambda: os.getenv("CMAKE_BUILD_TYPE"),
+
+    # If set, Aphrodite will print verbose logs during installation
+    "VERBOSE":
+    lambda: bool(int(os.getenv('VERBOSE', '0'))),
+
+    # Root directory for APHRODITE configuration files
+    # Defaults to `~/.config/aphrodite` unless `XDG_CONFIG_HOME` is set
+    # Note that this not only affects how aphrodite finds its configuration
+    # files during runtime, but also affects how aphrodite installs its
+    # configuration files during **installation**.
+    "APHRODITE_CONFIG_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_CONFIG_ROOT",
+            os.path.join(get_default_config_root(), "aphrodite"),
+        )),
+
+    # ================== Runtime Env Vars ==================
+
+    # Root directory for APHRODITE cache files
+    # Defaults to `~/.cache/aphrodite` unless `XDG_CACHE_HOME` is set
+    "APHRODITE_CACHE_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_CACHE_ROOT",
+            os.path.join(get_default_cache_root(), "aphrodite"),
+        )),
+
+    # used in distributed environment to determine the master address
+    'APHRODITE_HOST_IP':
+    lambda: os.getenv('APHRODITE_HOST_IP', "") or os.getenv("HOST_IP", ""),
+
+    # used in distributed environment to manually set the communication port
+    # Note: if APHRODITE_PORT is set, and some code asks for multiple ports, the
+    # APHRODITE_PORT will be used as the first port, and the rest will be
+    # generated by incrementing the APHRODITE_PORT value.
+    # '0' is used to make mypy happy
+    'APHRODITE_PORT':
+    lambda: int(os.getenv('APHRODITE_PORT', '0'))
+    if 'APHRODITE_PORT' in os.environ else None,
+
+    # path used for ipc when the frontend api server is running in
+    # multi-processing mode to communicate with the backend engine process.
+    'APHRODITE_RPC_BASE_PATH':
+    lambda: os.getenv('APHRODITE_RPC_BASE_PATH', tempfile.gettempdir()),
+
+    # If true, will load models from ModelScope instead of Hugging Face Hub.
+    # note that the value is true or false, not numbers
+    "APHRODITE_USE_MODELSCOPE":
+    lambda: os.environ.get(
+        "APHRODITE_USE_MODELSCOPE", "False").lower() == "true",
+
+    # Instance id represents an instance of the APHRODITE. All processes in the
+    # same instance should have the same instance id.
+    "APHRODITE_INSTANCE_ID":
+    lambda: os.environ.get("APHRODITE_INSTANCE_ID", None),
+
+    # Interval in seconds to log a warning message when the ring buffer is full
+    "APHRODITE_RINGBUFFER_WARNING_INTERVAL":
+    lambda: int(os.environ.get("APHRODITE_RINGBUFFER_WARNING_INTERVAL", "60")),
+
+    # path to cudatoolkit home directory, under which should be bin, include,
+    # and lib directories.
+    "CUDA_HOME":
+    lambda: os.environ.get("CUDA_HOME", None),
+
+    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
+    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
+    "APHRODITE_NCCL_SO_PATH":
+    lambda: os.environ.get("APHRODITE_NCCL_SO_PATH", None),
+
+    # when `APHRODITE_NCCL_SO_PATH` is not set, aphrodite will try to find the
+    # nccl library file in the locations specified by `LD_LIBRARY_PATH`
+    "LD_LIBRARY_PATH":
+    lambda: os.environ.get("LD_LIBRARY_PATH", None),
+
+    # flag to control if aphrodite should use triton flash attention
+    "APHRODITE_USE_TRITON_FLASH_ATTN":
+    lambda: (os.environ.get(
+        "APHRODITE_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")),
+
+    # Internal flag to enable Dynamo graph capture
+    "APHRODITE_TEST_DYNAMO_GRAPH_CAPTURE":
+    lambda: int(os.environ.get("APHRODITE_TEST_DYNAMO_GRAPH_CAPTURE", "0")),
+
+    # local rank of the process in the distributed setting, used to determine
+    # the GPU device id
+    "LOCAL_RANK":
+    lambda: int(os.environ.get("LOCAL_RANK", "0")),
+
+    # used to control the visible devices in the distributed setting
+    "CUDA_VISIBLE_DEVICES":
+    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
+
+    # timeout for each iteration in the engine
+    "APHRODITE_ENGINE_ITERATION_TIMEOUT_S":
+    lambda: int(os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", "60")),
+
+    # API key for APHRODITE API server
+    "APHRODITE_API_KEY":
+    lambda: os.environ.get("APHRODITE_API_KEY", None),
+
+    # S3 access information, used for tensorizer to load model from S3
+    "S3_ACCESS_KEY_ID":
+    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
+    "S3_SECRET_ACCESS_KEY":
+    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+    "S3_ENDPOINT_URL":
+    lambda: os.environ.get("S3_ENDPOINT_URL", None),
+
+    # Logging configuration
+    # If set to 0, aphrodite will not configure logging
+    # If set to 1, aphrodite will configure logging using the default
+    # configuration or the configuration file specified by
+    # APHRODITE_LOGGING_CONFIG_PATH
+    "APHRODITE_CONFIGURE_LOGGING":
+    lambda: int(os.getenv("APHRODITE_CONFIGURE_LOGGING", "1")),
+    "APHRODITE_LOGGING_CONFIG_PATH":
+    lambda: os.getenv("APHRODITE_LOGGING_CONFIG_PATH"),
+
+    # this is used for configuring the default logging level
+    "APHRODITE_LOGGING_LEVEL":
+    lambda: os.getenv("APHRODITE_LOGGING_LEVEL", "INFO"),
+
+    # Trace function calls
+    # If set to 1, aphrodite will trace function calls
+    # Useful for debugging
+    "APHRODITE_TRACE_FUNCTION":
+    lambda: int(os.getenv("APHRODITE_TRACE_FUNCTION", "0")),
+
+    # Backend for attention computation
+    # Available options:
+    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+    # - "FLASH_ATTN": use FlashAttention
+    # - "XFORMERS": use XFormers
+    # - "ROCM_FLASH": use ROCmFlashAttention
+    # - "FLASHINFER": use flashinfer
+    "APHRODITE_ATTENTION_BACKEND":
+    lambda: os.getenv("APHRODITE_ATTENTION_BACKEND", None),
+
+    # If set, aphrodite will use flashinfer sampler
+    "APHRODITE_USE_SAMPLING_KERNELS":
+    lambda: bool(int(os.getenv("APHRODITE_USE_SAMPLING_KERNELS", "0"))),
+
+    # Pipeline stage partition strategy
+    "APHRODITE_PP_LAYER_PARTITION":
+    lambda: os.getenv("APHRODITE_PP_LAYER_PARTITION", None),
+
+    # (CPU backend only) CPU key-value cache space.
+    # default is 4GB
+    "APHRODITE_CPU_KVCACHE_SPACE":
+    lambda: int(os.getenv("APHRODITE_CPU_KVCACHE_SPACE", "0")),
+
+    # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
+    # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
+    "APHRODITE_CPU_OMP_THREADS_BIND":
+    lambda: os.getenv("APHRODITE_CPU_OMP_THREADS_BIND", "all"),
+
+    # OpenVINO key-value cache space
+    # default is 4GB
+    "APHRODITE_OPENVINO_KVCACHE_SPACE":
+    lambda: int(os.getenv("APHRODITE_OPENVINO_KVCACHE_SPACE", "0")),
+
+    # OpenVINO KV cache precision
+    # default is bf16 if natively supported by platform, otherwise f16
+    # To enable KV cache compression, please, explicitly specify u8
+    "APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION":
+    lambda: os.getenv("APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION", None),
+
+    # Enables weights compression during model export via HF Optimum
+    # default is False
+    "APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
+    lambda: bool(os.getenv(
+        "APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
+
+    # If the env var is set, then all workers will execute as separate
+    # processes from the engine, and we use the same mechanism to trigger
+    # execution on all workers.
+    # Run aphrodite with APHRODITE_USE_RAY_SPMD_WORKER=1 to enable it.
+    "APHRODITE_USE_RAY_SPMD_WORKER":
+    lambda: bool(int(os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", "0"))),
+
+    # If the env var is set, it uses the Ray's compiled DAG API
+    # which optimizes the control plane overhead.
+    # Run aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
+    "APHRODITE_USE_RAY_COMPILED_DAG":
+    lambda: bool(int(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", "0"))),
+
+    # If the env var is set, it uses NCCL for communication in
+    # Ray's compiled DAG. This flag is ignored if
+    # APHRODITE_USE_RAY_COMPILED_DAG is not set.
+    "APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
+    lambda: bool(int(
+        os.getenv("APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))),
+
+    # Use dedicated multiprocess context for workers.
+    # Both spawn and fork work
+    "APHRODITE_WORKER_MULTIPROC_METHOD":
+    lambda: os.getenv("APHRODITE_WORKER_MULTIPROC_METHOD", "fork"),
+
+    # Path to the cache for storing downloaded assets
+    "APHRODITE_ASSETS_CACHE":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_ASSETS_CACHE",
+            os.path.join(get_default_cache_root(), "aphrodite", "assets"),
+        )),
+
+    # Timeout for fetching images when serving multimodal models
+    # Default is 5 seconds
+    "APHRODITE_IMAGE_FETCH_TIMEOUT":
+    lambda: int(os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT", "5")),
+
+    # Timeout for fetching audio when serving multimodal models
+    # Default is 5 seconds
+    "APHRODITE_AUDIO_FETCH_TIMEOUT":
+    lambda: int(os.getenv("APHRODITE_AUDIO_FETCH_TIMEOUT", "5")),
+
+    # Path to the XLA persistent cache directory.
+    # Only used for XLA devices such as TPUs.
+    "APHRODITE_XLA_CACHE_PATH":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "APHRODITE_XLA_CACHE_PATH",
+            os.path.join(get_default_cache_root(), "aphrodite", "xla_cache"),
+        )),
+    "APHRODITE_FUSED_MOE_CHUNK_SIZE":
+    lambda: int(os.getenv("APHRODITE_FUSED_MOE_CHUNK_SIZE", "65536")),
+
+    # If set, aphrodite will skip the deprecation warnings.
+    "APHRODITE_NO_DEPRECATION_WARNING":
+    lambda: bool(int(os.getenv("APHRODITE_NO_DEPRECATION_WARNING", "0"))),
+
+    # If set, the OpenAI API server will stay alive even after the underlying
+    # AsyncLLMEngine errors and stops serving requests
+    "APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH":
+    lambda: bool(os.getenv("APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
+
+    # If the env var APHRODITE_DYNAMIC_ROPE_SCALING is set, it allows
+    # the user to specify a max sequence length greater than
+    # the max length derived from the model's config.json.
+    # To enable this, set APHRODITE_DYNAMIC_ROPE_SCALING=1.
+    "APHRODITE_DYNAMIC_ROPE_SCALING":
+    lambda:
+    (os.environ.get(
+        "APHRODITE_DYNAMIC_ROPE_SCALING",
+        "0").strip().lower() in ("1", "true")),
+
+    # If set, forces FP8 Marlin to be used for FP8 quantization regardless
+    # of the hardware support for FP8 compute.
+    "APHRODITE_TEST_FORCE_FP8_MARLIN":
+    lambda:
+    (os.environ.get("APHRODITE_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
+     ("1", "true")),
+
+    # If set, allow running the engine as a separate ray actor,
+    # which is a deprecated feature soon to be removed.
+    "APHRODITE_ALLOW_ENGINE_USE_RAY":
+    lambda:
+    (os.environ.get("APHRODITE_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
+     ("1", "true")),
+
+    # a list of plugin names to load, separated by commas.
+    # if this is not set, it means all plugins will be loaded
+    # if this is set to an empty string, no plugins will be loaded
+    "APHRODITE_PLUGINS":
+    lambda: None if "APHRODITE_PLUGINS" not in os.environ else os.environ[
+        "APHRODITE_PLUGINS"].split(","),
+}
+
+# end-env-vars-definition
+
+
+def __getattr__(name: str):
+    # lazy evaluation of environment variables
+    if name in environment_variables:
+        return environment_variables[name]()
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    return list(environment_variables.keys())

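The module relies on PEP 562's module-level __getattr__, so every attribute access re-reads and re-parses the process environment; nothing is cached at import time. A quick sketch of the observable behavior:

    import os

    # Set before (or even after) importing envs; it is read on access.
    os.environ["APHRODITE_IMAGE_FETCH_TIMEOUT"] = "30"

    from aphrodite import envs

    print(envs.APHRODITE_IMAGE_FETCH_TIMEOUT)   # 30 (int, not str)
    print(envs.APHRODITE_USE_MODELSCOPE)        # False unless set to "true"
    assert "APHRODITE_ATTENTION_BACKEND" in dir(envs)  # __dir__ lists all vars
    # Unknown names raise AttributeError instead of returning None.
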
+ 2 - 1
aphrodite/executor/cpu_executor.py

@@ -5,6 +5,7 @@ from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import CacheConfig, ModelConfig, SchedulerConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_aphrodite_instance_id,
@@ -333,7 +334,7 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
         logger.warning("Prefix caching is not supported on CPU, disable it.")
         config.enable_prefix_caching = False
 
-    kv_cache_space_str = os.getenv("APHRODITE_CPU_KVCACHE_SPACE", "0")
+    kv_cache_space_str = envs.APHRODITE_CPU_KVCACHE_SPACE
     kv_cache_space = int(kv_cache_space_str)
 
     if kv_cache_space >= 0:

+ 3 - 1
aphrodite/executor/multiproc_worker_utils.py

@@ -14,6 +14,8 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
 
 from loguru import logger
 
+from aphrodite import envs
+
 T = TypeVar('T')
 
 _TERMINATE = "TERMINATE"  # sentinel
@@ -26,7 +28,7 @@ JOIN_TIMEOUT_S = 2
 
 # Use dedicated multiprocess context for workers.
 # Both spawn and fork work
-mp_method = os.getenv("APHRODITE_WORKER_MULTIPROC_METHOD", "fork")
+mp_method = envs.APHRODITE_WORKER_MULTIPROC_METHOD
 mp = multiprocessing.get_context(mp_method)

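The default stays "fork", so behavior is unchanged; as a usage note, "spawn" is the value to set when CUDA has already been initialized in the parent process, since forking after CUDA init is unsafe. A sketch:

    import multiprocessing
    from aphrodite import envs

    # Same call as in the hunk above; APHRODITE_WORKER_MULTIPROC_METHOD=spawn
    # switches the worker context when fork is not viable (e.g. after CUDA init).
    mp = multiprocessing.get_context(envs.APHRODITE_WORKER_MULTIPROC_METHOD)
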
+ 4 - 5
aphrodite/executor/openvino_executor.py

@@ -1,4 +1,3 @@
-import os
 from typing import List, Set, Tuple
 
 import openvino as ov
@@ -6,6 +5,7 @@ import openvino.properties.hint as hints
 import torch
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import CacheConfig, ModelConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_distributed_init_method,
@@ -13,10 +13,9 @@ from aphrodite.common.utils import (GiB_bytes, get_distributed_init_method,
 from aphrodite.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from aphrodite.lora.request import LoRARequest
 
-APHRODITE_OPENVINO_KVCACHE_SPACE = int(
-    os.getenv("APHRODITE_OPENVINO_KVCACHE_SPACE", 0))
-APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION = os.getenv(
-    "APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION", None)
+APHRODITE_OPENVINO_KVCACHE_SPACE = envs.APHRODITE_OPENVINO_KVCACHE_SPACE
+APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION = (
+    envs.APHRODITE_OPENVINO_CPU_KV_CACHE_PRECISION)
 
 
 class OpenVINOExecutor(ExecutorBase):

+ 7 - 8
aphrodite/executor/ray_gpu_executor.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 import msgspec
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (_run_task_with_lock,
                                     get_aphrodite_instance_id,
@@ -26,14 +27,12 @@ if TYPE_CHECKING:
 # If the env var is set, it uses the Ray's compiled DAG API
 # which optimizes the control plane overhead.
 # Run Aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
-APHRODITE_USE_RAY_COMPILED_DAG = bool(
-    os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", 0))
-APHRODITE_TRACE_FUNCTION = int(os.getenv("APHRODITE_TRACE_FUNCTION", 0))
-APHRODITE_USE_RAY_SPMD_WORKER = bool(
-    os.getenv("APHRODITE_USE_RAY_SPMD_WORKER", 0))
-
-APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = bool(
-    int(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", 1)))
+APHRODITE_USE_RAY_COMPILED_DAG = envs.APHRODITE_USE_RAY_COMPILED_DAG
+APHRODITE_TRACE_FUNCTION = envs.APHRODITE_TRACE_FUNCTION
+APHRODITE_USE_RAY_SPMD_WORKER = envs.APHRODITE_USE_RAY_SPMD_WORKER
+
+APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = (
+    envs.APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL)
 
 
 class RayGPUExecutor(DistributedGPUExecutor):

+ 2 - 1
aphrodite/executor/ray_tpu_executor.py

@@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple,
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (get_aphrodite_instance_id,
                                     get_distributed_init_method, get_ip,
@@ -21,7 +22,7 @@ if ray is not None:
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
 
-APHRODITE_TRACE_FUNCTION = int(os.getenv("APHRODITE_TRACE_FUNCTION", 0))
+APHRODITE_TRACE_FUNCTION = envs.APHRODITE_TRACE_FUNCTION
 
 
 class RayTPUExecutor(TPUExecutor):

+ 2 - 1
aphrodite/executor/ray_xpu_executor.py

@@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
 
 from loguru import logger
 
+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
                                      PromptAdapterConfig, SchedulerConfig,
@@ -28,7 +29,7 @@ if TYPE_CHECKING:
 # If the env var is set, it uses the Ray's compiled DAG API
 # which optimizes the control plane overhead.
 # Run Aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
-USE_RAY_COMPILED_DAG = bool(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", 0))
+USE_RAY_COMPILED_DAG = envs.APHRODITE_USE_RAY_COMPILED_DAG
 
 
 class RayXPUExecutor(DistributedGPUExecutor):

+ 2 - 2
aphrodite/modeling/layers/fused_moe/fused_moe.py

@@ -10,10 +10,10 @@ import triton.language as tl
 from loguru import logger
 
 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.platforms import current_platform
 
-APHRODITE_FUSED_MOE_CHUNK_SIZE = int(
-    os.getenv("APHRODITE_FUSED_MOE_CHUNK_SIZE", "65536"))
+APHRODITE_FUSED_MOE_CHUNK_SIZE = envs.APHRODITE_FUSED_MOE_CHUNK_SIZE
 
 
 @triton.jit

+ 2 - 3
aphrodite/modeling/layers/sampler.py

@@ -1,6 +1,5 @@
 """A layer that samples the next tokens from the model's outputs."""
 """A layer that samples the next tokens from the model's outputs."""
 import itertools
 import itertools
-import os
 import warnings
 import warnings
 from enum import IntEnum
 from enum import IntEnum
 from math import inf
 from math import inf
@@ -11,6 +10,7 @@ import torch.nn as nn
 from loguru import logger
 from loguru import logger
 
 
 import aphrodite._custom_ops as ops
 import aphrodite._custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.sampling_params import SamplingType
 from aphrodite.common.sampling_params import SamplingType
 from aphrodite.common.sequence import (CompletionSequenceGroupOutput, Logprob,
 from aphrodite.common.sequence import (CompletionSequenceGroupOutput, Logprob,
                                        PromptLogprobs, SampleLogprobs,
                                        PromptLogprobs, SampleLogprobs,
@@ -34,8 +34,7 @@ _TEMPERATURE_MINIMUM = 2e-5
 
 
 # If enabled, we switch to a more performant implementation
 # If enabled, we switch to a more performant implementation
 # of top-k and top-p
 # of top-k and top-p
-APHRODITE_USE_SAMPLING_KERNELS = bool(int(
-    os.getenv("APHRODITE_USE_SAMPLING_KERNELS", "0")))
+APHRODITE_USE_SAMPLING_KERNELS = envs.APHRODITE_USE_SAMPLING_KERNELS
 
 
 
 
 class SamplerID(IntEnum):
 class SamplerID(IntEnum):

+ 3 - 3
aphrodite/modeling/model_loader/openvino.py

@@ -1,5 +1,4 @@
 # ruff: noqa: SIM117
-import os
 from pathlib import Path
 from typing import List, Optional, Tuple

@@ -11,6 +10,7 @@ from openvino._offline_transformations import paged_attention_transformation
 from optimum.intel import OVModelForCausalLM
 from torch import nn

+from aphrodite import envs
 from aphrodite.attention.backends.openvino import OpenVINOAttentionMetadata
 from aphrodite.common.config import DeviceConfig, ModelConfig
 from aphrodite.common.sequence import SamplerOutput
@@ -19,8 +19,8 @@ from aphrodite.modeling.layers.logits_processor import (LogitsProcessor,
 from aphrodite.modeling.layers.sampler import Sampler
 from aphrodite.modeling.sampling_metadata import SamplingMetadata

-APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS = bool(
-    os.getenv("APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False))
+APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS = (
+    envs.APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS)


 def _flattenize_inputs(inputs):
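
Worth noting: the inline parsers being deleted did not agree on what counts as "true", which is part of the motivation for consolidating them. The old OpenVINO line above is the clearest case, since `bool()` of any non-empty string is `True`. A small self-contained demonstration; the variable name is taken from the diff, and the stricter parse mirrors the style of the old `fp8.py` line further below:

import os

os.environ["APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS"] = "0"

# Old style: bool() of a non-empty string is always True, so
# exporting "0" would still have enabled the feature.
old = bool(os.getenv("APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False))
assert old is True  # surprising

# Stricter style (as in the old fp8.py parser): only "1" or "true" enable it.
new = (os.getenv("APHRODITE_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0")
       .strip().lower() in ("1", "true"))
assert new is False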

+ 4 - 3
aphrodite/modeling/model_loader/tensorizer.py

@@ -13,6 +13,7 @@ from loguru import logger
 from torch import nn
 from transformers import PretrainedConfig

+from aphrodite import envs
 from aphrodite.common.config import ModelConfig, ParallelConfig
 from aphrodite.engine.aphrodite_engine import AphroditeEngine
 from aphrodite.engine.args_tools import EngineArgs
@@ -148,12 +149,12 @@ class TensorizerArgs:
     def __post_init__(self):
         self.file_obj = self.tensorizer_uri
         self.s3_access_key_id = (self.s3_access_key_id
-                                 or os.environ.get("S3_ACCESS_KEY_ID")) or None
+                                 or envs.S3_ACCESS_KEY_ID) or None
         self.s3_secret_access_key = (
             self.s3_secret_access_key
-            or os.environ.get("S3_SECRET_ACCESS_KEY")) or None
+            or envs.S3_SECRET_ACCESS_KEY) or None
         self.s3_endpoint = (self.s3_endpoint
-                            or os.environ.get("S3_ENDPOINT_URL")) or None
+                            or envs.S3_ENDPOINT_URL) or None
         self.stream_params = {
             "s3_access_key_id": self.s3_access_key_id,
             "s3_secret_access_key": self.s3_secret_access_key,

+ 3 - 5
aphrodite/multimodal/utils.py

@@ -1,5 +1,4 @@
 import base64
-import os
 from io import BytesIO
 from typing import Tuple, Union

@@ -8,14 +7,13 @@ import numpy as np
 import soundfile
 from PIL import Image

+from aphrodite import envs
 from aphrodite.common.connections import global_http_connection
 from aphrodite.multimodal.base import MultiModalDataDict

-APHRODITE_IMAGE_FETCH_TIMEOUT = int(
-    os.getenv("APHRODITE_IMAGE_FETCH_TIMEOUT", 10))
+APHRODITE_IMAGE_FETCH_TIMEOUT = envs.APHRODITE_IMAGE_FETCH_TIMEOUT

-APHRODITE_AUDIO_FETCH_TIMEOUT = int(
-    os.getenv("APHRODITE_AUDIO_FETCH_TIMEOUT", 10))
+APHRODITE_AUDIO_FETCH_TIMEOUT = envs.APHRODITE_AUDIO_FETCH_TIMEOUT


 def _load_image_from_bytes(b: bytes):

+ 3 - 5
aphrodite/plugins/__init__.py

@@ -1,9 +1,7 @@
-import os
-
 from loguru import logger

-APHRODITE_PLUGINS = None if "APHRODITE_PLUGINS" not in os.environ else \
-    os.environ["APHRODITE_PLUGINS"].split(",")
+from aphrodite import envs
+

 def load_general_plugins():
     """WARNING: plugins can be loaded for multiple times in different
@@ -16,7 +14,7 @@ def load_general_plugins():
     else:
         from importlib.metadata import entry_points

-    allowed_plugins = APHRODITE_PLUGINS
+    allowed_plugins = envs.APHRODITE_PLUGINS

     discovered_plugins = entry_points(group='aphrodite.general_plugins')
     for plugin in discovered_plugins:
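
For context, a third-party package would hook into the `aphrodite.general_plugins` entry-point group roughly like this. The package and module names are hypothetical, and exactly how the loaded object is invoked depends on the unexpanded body of the loop above:

from setuptools import setup

setup(
    name="my-aphrodite-plugin",  # hypothetical package
    version="0.1",
    py_modules=["my_plugin"],
    entry_points={
        "aphrodite.general_plugins": [
            # Discovered by entry_points(group='aphrodite.general_plugins').
            # Setting APHRODITE_PLUGINS=my_plugin restricts loading to the
            # names in that comma-separated list.
            "my_plugin = my_plugin:register",
        ],
    },
)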

+ 2 - 3
aphrodite/quantization/fp8.py

@@ -1,4 +1,3 @@
-import os
 from typing import Any, Dict, List, Optional

 import torch
@@ -7,6 +6,7 @@ from torch.nn import Module
 from torch.nn.parameter import Parameter

 from aphrodite import _custom_ops as ops
+from aphrodite import envs
 from aphrodite.common.utils import is_hip, print_warning_once
 from aphrodite.modeling.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from aphrodite.modeling.layers.linear import (LinearBase, LinearMethodBase,
@@ -26,8 +26,7 @@ from aphrodite.quantization.utils.w8a8_utils import (
     requantize_with_max_scale)

 ACTIVATION_SCHEMES = ["static", "dynamic"]
-APHRODITE_TEST_FORCE_FP8_MARLIN = os.environ.get(
-    "APHRODITE_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in ("1", "true")
+APHRODITE_TEST_FORCE_FP8_MARLIN = envs.APHRODITE_TEST_FORCE_FP8_MARLIN


 class Fp8Config(QuantizationConfig):

+ 3 - 3
aphrodite/server/launch.py

@@ -1,5 +1,4 @@
 import asyncio
-import os
 import signal
 from http import HTTPStatus
 from typing import Any
@@ -8,12 +7,13 @@ import uvicorn
 from fastapi import FastAPI, Response
 from loguru import logger

+from aphrodite import envs
 from aphrodite.common.utils import find_process_using_port, in_windows
 from aphrodite.engine.async_aphrodite import AsyncEngineDeadError
 from aphrodite.engine.protocol import AsyncEngineClient

-APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH = bool(os.getenv(
-    "APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH", 0))
+APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH = (
+    envs.APHRODITE_KEEP_ALIVE_ON_ENGINE_DEATH)


 async def serve_http(app: FastAPI, engine: AsyncEngineClient,

+ 2 - 3
aphrodite/task_handler/cpu_worker.py

@@ -1,10 +1,10 @@
 """A CPU worker class."""
 """A CPU worker class."""
-import os
 from typing import Dict, List, Optional, Tuple
 from typing import Dict, List, Optional, Tuple
 
 
 import torch
 import torch
 import torch.distributed
 import torch.distributed
 
 
+from aphrodite import envs
 from aphrodite.attention import get_attn_backend
 from aphrodite.attention import get_attn_backend
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
@@ -19,8 +19,7 @@ from aphrodite.task_handler.worker_base import (LocalOrDistributedWorkerBase,
                                                 LoraNotSupportedWorkerBase,
                                                 LoraNotSupportedWorkerBase,
                                                 WorkerInput)
                                                 WorkerInput)
 
 
-APHRODITE_CPU_OMP_THREADS_BIND = os.getenv("APHRODITE_CPU_OMP_THREADS_BIND",
-                                           "all")
+APHRODITE_CPU_OMP_THREADS_BIND = envs.APHRODITE_CPU_OMP_THREADS_BIND
 
 
 
 
 class CPUCacheEngine:
 class CPUCacheEngine:

+ 2 - 2
aphrodite/task_handler/tpu_worker.py

@@ -5,6 +5,7 @@ import torch
 import torch_xla.core.xla_model as xm
 import torch_xla.runtime as xr

+from aphrodite import envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)
@@ -99,8 +100,7 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         # Use persistent cache to avoid XLA recompilation.
         # NOTE: Set per-rank cache path since different ranks
         # can have slightly different XLA graphs.
-        APHRODITE_XLA_CACHE_PATH = os.getenv("APHRODITE_XLA_CACHE_PATH",
-                                             "~/.aphrodite/xla_cache/")
+        APHRODITE_XLA_CACHE_PATH = envs.APHRODITE_XLA_CACHE_PATH
         world_size = self.parallel_config.world_size
         per_rank_path = os.path.join(APHRODITE_XLA_CACHE_PATH,
                                      f"tp{world_size}_rank{self.rank}")

+ 2 - 2
aphrodite/transformers_utils/config.py

@@ -1,7 +1,6 @@
 import contextlib
 import enum
 import json
-import os
 from pathlib import Path
 from typing import Any, Dict, Optional, Type, Union

@@ -14,6 +13,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME

+from aphrodite import envs
 from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   InternVLChatConfig,
                                                   JAISConfig, MedusaConfig,
@@ -21,7 +21,7 @@ from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   MPTConfig, RWConfig)
 from aphrodite.transformers_utils.utils import check_gguf_file

-APHRODITE_USE_MODELSCOPE = os.getenv("APHRODITE_USE_MODELSCOPE", "0") == "1"
+APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE

 if APHRODITE_USE_MODELSCOPE:
     from modelscope import AutoConfig

+ 4 - 5
examples/tensorize_aphrodite_model.py

@@ -1,10 +1,9 @@
 import argparse
 import dataclasses
 import json
-import os
 import uuid

-from aphrodite import LLM
+from aphrodite import LLM, envs
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.modeling.model_loader.tensorizer import (
     TensorizerArgs, TensorizerConfig, tensorize_aphrodite_model)
@@ -177,11 +176,11 @@ if __name__ == '__main__':
     args = parse_args()

     s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-                        or os.environ.get("S3_ACCESS_KEY_ID", None))
+                        or envs.S3_ACCESS_KEY_ID)
     s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
+                            or envs.S3_SECRET_ACCESS_KEY)
     s3_endpoint = (getattr(args, 's3_endpoint', None)
-                or os.environ.get("S3_ENDPOINT_URL", None))
+                or envs.S3_ENDPOINT_URL)

     credentials = {
         "s3_access_key_id": s3_access_key_id,

+ 22 - 7
setup.py

@@ -1,3 +1,4 @@
+import importlib.util
 import io
 import logging
 import os
@@ -14,10 +15,16 @@ from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 from torch.utils.cpp_extension import CUDA_HOME

+
+def load_module_from_path(module_name, path):
+    spec = importlib.util.spec_from_file_location(module_name, path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
-# Target device of Aphrodite, supporting [cuda (by default), rocm, neuron, cpu]
-APHRODITE_TARGET_DEVICE = os.getenv("APHRODITE_TARGET_DEVICE", "cuda")


 def embed_commit_hash():
@@ -47,6 +54,14 @@ def embed_commit_hash():

 embed_commit_hash()

+
+# cannot import envs directly because it depends on aphrodite,
+#  which is not installed yet
+envs = load_module_from_path('envs', os.path.join(
+    ROOT_DIR, 'aphrodite', 'envs.py'))
+
+APHRODITE_TARGET_DEVICE = envs.APHRODITE_TARGET_DEVICE
+
 if not sys.platform.startswith("linux"):
     logger.warning(
         "Aphrodite only supports Linux platform (including WSL). "
@@ -97,7 +112,7 @@ class cmake_build_ext(build_ext):
     def compute_num_jobs(self):
         # `num_jobs` is either the value of the MAX_JOBS environment variable
         # (if defined) or the number of CPUs available.
-        num_jobs = os.environ.get("MAX_JOBS", None)
+        num_jobs = envs.MAX_JOBS
         if num_jobs is not None:
             num_jobs = int(num_jobs)
             logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")
@@ -118,7 +133,7 @@ class cmake_build_ext(build_ext):
             # environment variable (if defined) or 1.
             # when it is set, we reduce `num_jobs` to avoid
             # overloading the system.
-            nvcc_threads = os.getenv("NVCC_THREADS", None)
+            nvcc_threads = envs.NVCC_THREADS
            if nvcc_threads is not None:
                 nvcc_threads = int(nvcc_threads)
                 logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number"
@@ -143,7 +158,7 @@ class cmake_build_ext(build_ext):
         # Select the build type.
         # Note: optimization level + debug info are set by the build type
         default_cfg = "Debug" if self.debug else "RelWithDebInfo"
-        cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
+        cfg = envs.CMAKE_BUILD_TYPE or default_cfg

         # where .so files will be written, should be the same for all extensions
         # that use the same CMakeLists.txt.
@@ -161,7 +176,7 @@ class cmake_build_ext(build_ext):
             '-DAPHRODITE_TARGET_DEVICE={}'.format(APHRODITE_TARGET_DEVICE),
         ]

-        verbose = bool(int(os.getenv('VERBOSE', '0')))
+        verbose = envs.VERBOSE
         if verbose:
             cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']

@@ -469,7 +484,7 @@ package_data = {
         "py.typed", "modeling/layers/fused_moe/configs/*.json"
     ]
 }
-if os.environ.get("APHRODITE_USE_PRECOMPILED"):
+if envs.APHRODITE_USE_PRECOMPILED:
     ext_modules = []
     package_data["aphrodite"].append("*.so")