Ver Fonte

core: fix spec decode metrics and envs circular import (#889)

* feat: add torch profiler support

* revert torch profiler

* fix: move envs to common
AlpinDale há 2 meses
pai
commit
22a4cd4595
47 ficheiros alterados com 48 adições e 50 exclusões
  1. 1 1
      aphrodite/assets/base.py
  2. 1 1
      aphrodite/attention/backends/rocm_flash_attn.py
  3. 1 1
      aphrodite/attention/selector.py
  4. 1 1
      aphrodite/common/config.py
  5. 0 0
      aphrodite/common/envs.py
  6. 1 1
      aphrodite/common/logger.py
  7. 1 1
      aphrodite/common/sampling_params.py
  8. 1 1
      aphrodite/common/sequence.py
  9. 1 1
      aphrodite/common/utils.py
  10. 1 1
      aphrodite/distributed/device_communicators/custom_all_reduce.py
  11. 1 1
      aphrodite/distributed/device_communicators/custom_all_reduce_utils.py
  12. 1 1
      aphrodite/distributed/device_communicators/shm_broadcast.py
  13. 1 1
      aphrodite/distributed/parallel_state.py
  14. 1 1
      aphrodite/distributed/utils.py
  15. 1 1
      aphrodite/endpoints/openai/api_server.py
  16. 1 1
      aphrodite/engine/aphrodite_engine.py
  17. 1 1
      aphrodite/engine/args_tools.py
  18. 1 1
      aphrodite/engine/async_aphrodite.py
  19. 1 1
      aphrodite/executor/cpu_executor.py
  20. 1 1
      aphrodite/executor/multiproc_worker_utils.py
  21. 1 1
      aphrodite/executor/openvino_executor.py
  22. 1 1
      aphrodite/executor/ray_gpu_executor.py
  23. 1 1
      aphrodite/executor/ray_tpu_executor.py
  24. 1 1
      aphrodite/executor/ray_xpu_executor.py
  25. 0 4
      aphrodite/modeling/__init__.py
  26. 1 1
      aphrodite/modeling/layers/fused_moe/fused_moe.py
  27. 1 1
      aphrodite/modeling/layers/sampler.py
  28. 1 1
      aphrodite/modeling/model_loader/openvino.py
  29. 1 1
      aphrodite/modeling/model_loader/tensorizer.py
  30. 1 1
      aphrodite/modeling/models/mlp_speculator.py
  31. 1 1
      aphrodite/multimodal/utils.py
  32. 1 1
      aphrodite/plugins/__init__.py
  33. 1 1
      aphrodite/quantization/fp8.py
  34. 1 1
      aphrodite/server/launch.py
  35. 1 1
      aphrodite/spec_decode/medusa_worker.py
  36. 1 1
      aphrodite/spec_decode/mlp_speculator_worker.py
  37. 1 1
      aphrodite/task_handler/cpu_model_runner.py
  38. 1 1
      aphrodite/task_handler/cpu_worker.py
  39. 1 1
      aphrodite/task_handler/enc_dec_model_runner.py
  40. 2 1
      aphrodite/task_handler/model_runner.py
  41. 2 2
      aphrodite/task_handler/model_runner_base.py
  42. 1 1
      aphrodite/task_handler/neuron_model_runner.py
  43. 1 1
      aphrodite/task_handler/openvino_model_runner.py
  44. 1 1
      aphrodite/task_handler/tpu_worker.py
  45. 1 1
      aphrodite/transformers_utils/config.py
  46. 2 1
      examples/tensorize_aphrodite_model.py
  47. 1 1
      tests/worker/test_model_input.py

+ 1 - 1
aphrodite/assets/base.py

@@ -5,7 +5,7 @@ from functools import lru_cache
 from pathlib import Path
 from typing import Optional
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.connections import global_http_connection
 
 

+ 1 - 1
aphrodite/attention/backends/rocm_flash_attn.py

@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 import torch
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.attention.backends.abstract import (AttentionBackend,
                                                    AttentionImpl,
                                                    AttentionMetadata,

+ 1 - 1
aphrodite/attention/selector.py

@@ -7,7 +7,7 @@ from typing import Generator, Optional, Type
 import torch
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.attention.backends.abstract import AttentionBackend
 from aphrodite.common.utils import (STR_BACKEND_ENV_VAR, is_cpu, is_hip,
                                     is_openvino, is_xpu)

+ 1 - 1
aphrodite/common/config.py

@@ -9,7 +9,7 @@ import torch
 from loguru import logger
 from transformers import PretrainedConfig
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
                                     cuda_device_count_stateless,
                                     get_cpu_memory, is_cpu, is_hip, is_neuron,

+ 0 - 0
aphrodite/envs.py → aphrodite/common/envs.py


+ 1 - 1
aphrodite/common/logger.py

@@ -15,7 +15,7 @@ from rich.markup import escape
 from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            TaskProgressColumn, TextColumn, TimeRemainingColumn)
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 
 RICH_CONSOLE = Console()
 LOG_LEVEL = os.getenv("APHRODITE_LOG_LEVEL", "INFO").upper()

+ 1 - 1
aphrodite/common/sampling_params.py

@@ -9,7 +9,7 @@ import torch
 from loguru import logger
 from typing_extensions import Annotated
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 
 _SAMPLING_EPS = 1e-5
 _MAX_TEMP = 1e-2

+ 1 - 1
aphrodite/common/sequence.py

@@ -17,11 +17,11 @@ from aphrodite.constants import APHRODITE_TOKEN_ID_ARRAY_TYPE
 from aphrodite.inputs.parse import is_valid_encoder_decoder_llm_inputs
 from aphrodite.lora.request import LoRARequest
 from aphrodite.prompt_adapter.request import PromptAdapterRequest
+from aphrodite.spec_decode.metrics import SpecDecodeWorkerMetrics
 
 if TYPE_CHECKING:
     from aphrodite.inputs import LLMInputs
     from aphrodite.multimodal import MultiModalDataDict
-    from aphrodite.spec_decode.metrics import SpecDecodeWorkerMetrics
 
 
 @dataclass

+ 1 - 1
aphrodite/common/utils.py

@@ -31,7 +31,7 @@ from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
                            SpinnerColumn, TextColumn, TimeElapsedColumn)
 from typing_extensions import ParamSpec, TypeIs, assert_never
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.logger import enable_trace_function_call
 from aphrodite.distributed import get_tensor_model_parallel_rank
 

+ 1 - 1
aphrodite/distributed/device_communicators/custom_all_reduce.py

@@ -6,8 +6,8 @@ import torch.distributed as dist
 from loguru import logger
 from torch.distributed import ProcessGroup
 
+import aphrodite.common.envs as envs
 from aphrodite import _custom_ops as ops
-from aphrodite import envs
 from aphrodite.common.utils import cuda_device_count_stateless
 from aphrodite.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)

+ 1 - 1
aphrodite/distributed/device_communicators/custom_all_reduce_utils.py

@@ -11,7 +11,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.utils import (cuda_device_count_stateless,
                                     update_environment_variables)
 from aphrodite.distributed.device_communicators.cuda_wrapper import (

+ 1 - 1
aphrodite/distributed/device_communicators/shm_broadcast.py

@@ -12,7 +12,7 @@ from loguru import logger
 from torch.distributed import ProcessGroup
 from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.utils import get_ip, get_open_port
 
 APHRODITE_RINGBUFFER_WARNING_INTERVAL = (

+ 1 - 1
aphrodite/distributed/parallel_state.py

@@ -35,7 +35,7 @@ import torch.distributed
 from loguru import logger
 from torch.distributed import Backend, ProcessGroup
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 
 
 @dataclass

+ 1 - 1
aphrodite/distributed/utils.py

@@ -7,7 +7,7 @@ from typing import Sequence, Tuple
 
 import torch
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 
 APHRODITE_PP_LAYER_PARTITION = envs.APHRODITE_PP_LAYER_PARTITION
 

+ 1 - 1
aphrodite/endpoints/openai/api_server.py

@@ -20,7 +20,7 @@ from fastapi.responses import (HTMLResponse, JSONResponse, Response,
 from loguru import logger
 from starlette.routing import Mount
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sampling_params import _SAMPLING_EPS, SamplingParams

+ 1 - 1
aphrodite/engine/aphrodite_engine.py

@@ -8,7 +8,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.config import (CacheConfig, DecodingConfig, DeviceConfig,
                                      EngineConfig, LoadConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,

+ 1 - 1
aphrodite/engine/args_tools.py

@@ -7,7 +7,7 @@ from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,
 
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.config import (CacheConfig, ConfigFormat, DecodingConfig,
                                      DeviceConfig, EngineConfig, LoadConfig,
                                      LoadFormat, LoRAConfig, ModelConfig,

+ 1 - 1
aphrodite/engine/async_aphrodite.py

@@ -10,7 +10,7 @@ from loguru import logger
 from transformers import PreTrainedTokenizer
 from typing_extensions import assert_never
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.config import (DecodingConfig, EngineConfig, LoRAConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)

+ 1 - 1
aphrodite/executor/cpu_executor.py

@@ -5,7 +5,7 @@ from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 import torch
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.config import CacheConfig, ModelConfig, SchedulerConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_aphrodite_instance_id,

+ 1 - 1
aphrodite/executor/multiproc_worker_utils.py

@@ -14,7 +14,7 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
 
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 
 T = TypeVar('T')
 

+ 1 - 1
aphrodite/executor/openvino_executor.py

@@ -5,7 +5,7 @@ import openvino.properties.hint as hints
 import torch
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.config import CacheConfig, ModelConfig
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (GiB_bytes, get_distributed_init_method,

+ 1 - 1
aphrodite/executor/ray_gpu_executor.py

@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 import msgspec
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (_run_task_with_lock,
                                     get_aphrodite_instance_id,

+ 1 - 1
aphrodite/executor/ray_tpu_executor.py

@@ -7,7 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple,
 
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
 from aphrodite.common.utils import (get_aphrodite_instance_id,
                                     get_distributed_init_method, get_ip,

+ 1 - 1
aphrodite/executor/ray_xpu_executor.py

@@ -7,7 +7,7 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
 
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,
                                      PromptAdapterConfig, SchedulerConfig,

+ 0 - 4
aphrodite/modeling/__init__.py

@@ -1,12 +1,8 @@
 from aphrodite.modeling.parameter import (BaseAphroditeParameter,
                                           PackedAphroditeParameter)
-from aphrodite.modeling.sampling_metadata import (SamplingMetadata,
-                                                  SamplingMetadataCache)
 from aphrodite.modeling.utils import set_random_seed
 
 __all__ = [
-    "SamplingMetadata",
-    "SamplingMetadataCache",
     "set_random_seed",
     "BaseAphroditeParameter",
     "PackedAphroditeParameter",

+ 1 - 1
aphrodite/modeling/layers/fused_moe/fused_moe.py

@@ -9,8 +9,8 @@ import triton
 import triton.language as tl
 from loguru import logger
 
+import aphrodite.common.envs as envs
 from aphrodite import _custom_ops as ops
-from aphrodite import envs
 from aphrodite.platforms import current_platform
 
 APHRODITE_FUSED_MOE_CHUNK_SIZE = envs.APHRODITE_FUSED_MOE_CHUNK_SIZE

+ 1 - 1
aphrodite/modeling/layers/sampler.py

@@ -10,7 +10,7 @@ import torch.nn as nn
 from loguru import logger
 
 import aphrodite._custom_ops as ops
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.sampling_params import SamplingType
 from aphrodite.common.sequence import (CompletionSequenceGroupOutput, Logprob,
                                        PromptLogprobs, SampleLogprobs,

+ 1 - 1
aphrodite/modeling/model_loader/openvino.py

@@ -10,7 +10,7 @@ from openvino._offline_transformations import paged_attention_transformation
 from optimum.intel import OVModelForCausalLM
 from torch import nn
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.attention.backends.openvino import OpenVINOAttentionMetadata
 from aphrodite.common.config import DeviceConfig, ModelConfig
 from aphrodite.common.sequence import SamplerOutput

+ 1 - 1
aphrodite/modeling/model_loader/tensorizer.py

@@ -13,7 +13,7 @@ from loguru import logger
 from torch import nn
 from transformers import PretrainedConfig
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.config import ModelConfig, ParallelConfig
 from aphrodite.engine.aphrodite_engine import AphroditeEngine
 from aphrodite.engine.args_tools import EngineArgs

+ 1 - 1
aphrodite/modeling/models/mlp_speculator.py

@@ -5,12 +5,12 @@ import torch
 import torch.nn as nn
 
 from aphrodite.common.sequence import SamplerOutput
-from aphrodite.modeling import SamplingMetadata
 from aphrodite.modeling.layers.logits_processor import LogitsProcessor
 from aphrodite.modeling.layers.sampler import Sampler
 from aphrodite.modeling.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.transformers_utils.configs import MLPSpeculatorConfig
 
 SQRT2 = 2**0.5

+ 1 - 1
aphrodite/multimodal/utils.py

@@ -7,7 +7,7 @@ import numpy as np
 import soundfile
 from PIL import Image
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.connections import global_http_connection
 from aphrodite.multimodal.base import MultiModalDataDict
 

+ 1 - 1
aphrodite/plugins/__init__.py

@@ -1,6 +1,6 @@
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 
 
 def load_general_plugins():

+ 1 - 1
aphrodite/quantization/fp8.py

@@ -5,8 +5,8 @@ from loguru import logger
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 
+import aphrodite.common.envs as envs
 from aphrodite import _custom_ops as ops
-from aphrodite import envs
 from aphrodite.common.utils import is_hip, print_warning_once
 from aphrodite.modeling.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from aphrodite.modeling.layers.linear import (LinearBase, LinearMethodBase,

+ 1 - 1
aphrodite/server/launch.py

@@ -7,7 +7,7 @@ import uvicorn
 from fastapi import FastAPI, Response
 from loguru import logger
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.utils import find_process_using_port, in_windows
 from aphrodite.engine.async_aphrodite import AsyncEngineDeadError
 from aphrodite.engine.protocol import AsyncEngineClient

+ 1 - 1
aphrodite/spec_decode/medusa_worker.py

@@ -5,7 +5,7 @@ import torch
 
 from aphrodite.common.sequence import (ExecuteModelRequest, SamplerOutput,
                                        SequenceGroupMetadata)
-from aphrodite.modeling import SamplingMetadata
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.spec_decode.interfaces import SpeculativeProposals
 from aphrodite.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
 from aphrodite.spec_decode.top1_proposer import Top1Proposer

+ 1 - 1
aphrodite/spec_decode/mlp_speculator_worker.py

@@ -4,7 +4,7 @@ import torch
 
 from aphrodite.common.sequence import (ExecuteModelRequest, SamplerOutput,
                                        SequenceGroupMetadata)
-from aphrodite.modeling import SamplingMetadata
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.spec_decode.multi_step_worker import MultiStepWorker
 from aphrodite.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
 

+ 1 - 1
aphrodite/task_handler/cpu_model_runner.py

@@ -11,8 +11,8 @@ from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
 from aphrodite.common.sequence import (IntermediateTensors, SamplerOutput,
                                        SequenceGroupMetadata)
 from aphrodite.common.utils import make_tensor_with_pad
-from aphrodite.modeling import SamplingMetadata
 from aphrodite.modeling.model_loader import get_model
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                                   MultiModalInputs)
 from aphrodite.task_handler.model_runner_base import (

+ 1 - 1
aphrodite/task_handler/cpu_worker.py

@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Tuple
 import torch
 import torch.distributed
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.attention import get_attn_backend
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, ParallelConfig,

+ 1 - 1
aphrodite/task_handler/enc_dec_model_runner.py

@@ -21,7 +21,7 @@ from aphrodite.common.sequence import (IntermediateTensors, PoolerOutput,
 from aphrodite.common.utils import (STR_NOT_IMPL_ENC_DEC_BACKEND,
                                     make_tensor_with_pad)
 from aphrodite.inputs import INPUT_REGISTRY, InputRegistry
-from aphrodite.modeling import SamplingMetadata
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from aphrodite.task_handler.model_runner import (
     GPUModelRunnerBase, ModelInputForGPUBuilder,

+ 2 - 1
aphrodite/task_handler/model_runner.py

@@ -35,12 +35,13 @@ from aphrodite.inputs import INPUT_REGISTRY, InputRegistry
 from aphrodite.lora.layers import LoRAMapping
 from aphrodite.lora.request import LoRARequest
 from aphrodite.lora.worker_manager import LRUCacheWorkerLoRAManager
-from aphrodite.modeling import SamplingMetadata, SamplingMetadataCache
 from aphrodite.modeling.model_loader import get_model
 from aphrodite.modeling.model_loader.tensorizer import TensorizerConfig
 from aphrodite.modeling.models.interfaces import (supports_lora,
                                                   supports_multimodal)
 from aphrodite.modeling.models.utils import set_cpu_offload_max_bytes
+from aphrodite.modeling.sampling_metadata import (SamplingMetadata,
+                                                  SamplingMetadataCache)
 from aphrodite.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                                   MultiModalInputs, MultiModalRegistry)
 from aphrodite.prompt_adapter.layers import PromptAdapterMapping

+ 2 - 2
aphrodite/task_handler/model_runner_base.py

@@ -12,7 +12,7 @@ from aphrodite.platforms import current_platform
 if TYPE_CHECKING:
     from aphrodite.attention import AttentionMetadata
     from aphrodite.attention.backends.abstract import AttentionBackend
-    from aphrodite.modeling import SamplingMetadata
+    from aphrodite.modeling.sampling_metadata import SamplingMetadata
 
 T = TypeVar('T', bound="BroadcastableModelInput")
 
@@ -54,7 +54,7 @@ def _init_sampling_metadata_from_tensor_dict(  # type: ignore
     Helper method to initialize SamplingMetadata based on broadcastable
     SamplingMetadata fields.
     """
-    from aphrodite.modeling import SamplingMetadata
+    from aphrodite.modeling.sampling_metadata import SamplingMetadata
 
     selected_token_indices = tensor_dict.pop("selected_token_indices", None)
     # An empty SamplingMetadata to signal that the worker should skip

+ 1 - 1
aphrodite/task_handler/neuron_model_runner.py

@@ -11,8 +11,8 @@ from aphrodite.common.sequence import (IntermediateTensors, SamplerOutput,
                                        SequenceGroupMetadata)
 from aphrodite.common.utils import (is_pin_memory_available,
                                     make_tensor_with_pad)
-from aphrodite.modeling import SamplingMetadata
 from aphrodite.modeling.model_loader.neuron import get_neuron_model
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                                   MultiModalInputs)
 from aphrodite.task_handler.model_runner_base import (ModelRunnerBase,

+ 1 - 1
aphrodite/task_handler/openvino_model_runner.py

@@ -10,8 +10,8 @@ from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      LoRAConfig, ModelConfig, MultiModalConfig,
                                      ParallelConfig, SchedulerConfig)
 from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata
-from aphrodite.modeling import SamplingMetadata
 from aphrodite.modeling.model_loader.openvino import get_model
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                                   MultiModalInputs)
 

+ 1 - 1
aphrodite/task_handler/tpu_worker.py

@@ -5,7 +5,7 @@ import torch
 import torch_xla.core.xla_model as xm
 import torch_xla.runtime as xr
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.common.config import (CacheConfig, DeviceConfig, LoadConfig,
                                      ModelConfig, ParallelConfig,
                                      SchedulerConfig)

+ 1 - 1
aphrodite/transformers_utils/config.py

@@ -13,7 +13,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
-from aphrodite import envs
+import aphrodite.common.envs as envs
 from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   InternVLChatConfig,
                                                   JAISConfig, MedusaConfig,

+ 2 - 1
examples/tensorize_aphrodite_model.py

@@ -3,7 +3,8 @@ import dataclasses
 import json
 import uuid
 
-from aphrodite import LLM, envs
+import aphrodite.common.envs as envs
+from aphrodite import LLM
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.modeling.model_loader.tensorizer import (
     TensorizerArgs, TensorizerConfig, tensorize_aphrodite_model)

+ 1 - 1
tests/worker/test_model_input.py

@@ -5,8 +5,8 @@ import torch
 
 from aphrodite.attention import AttentionMetadata, AttentionMetadataBuilder
 from aphrodite.attention.backends.abstract import AttentionBackend
-from aphrodite.modeling import SamplingMetadata
 from aphrodite.modeling.pooling_metadata import PoolingMetadata
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.task_handler.embedding_model_runner import (
     ModelInputForGPUWithPoolingMetadata)
 from aphrodite.task_handler.model_runner import (