@@ -13,7 +13,7 @@ from aphrodite.modeling.megatron.parallel_state import (
 from aphrodite.common.sampling_params import SamplingParams
 from aphrodite.common.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 from aphrodite.task_handler.cache_engine import CacheEngine
-from aphrodite.common.utils import get_gpu_memory, get_max_shared_memory_bytes
+from aphrodite.common.utils import get_gpu_memory
 
 
 class Worker:
@@ -141,14 +141,6 @@ class Worker:
         self.block_size = cache_config.block_size
         self.sliding_window = cache_config.sliding_window
 
-        if self.sliding_window is None:
-            max_seq_len = self.scheduler_config.max_model_len
-        else:
-            max_seq_len = min(self.scheduler_config.max_model_len,
-                              self.sliding_window)
-
-        _check_if_can_support_max_seq_len(max_seq_len, self.block_size)
-
         self.cache_engine = CacheEngine(self.cache_config, self.model_config,
                                         self.parallel_config)
         self.cache_events = self.cache_engine.events
@@ -385,26 +377,6 @@ def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
     return x + [pad] * (max_len - len(x))
 
 
-def _check_if_can_support_max_seq_len(max_seq_len: int,
-                                      block_size: int) -> None:
-    # Follows the logic in
-    # attention_kernels.cu::single_query_cached_kv_attention_launcher
-    max_shared_mem = get_max_shared_memory_bytes()
-    float32_bytes = torch.finfo(torch.float).bits // 8
-    padded_max_seq_len = (
-        (max_seq_len + block_size - 1) / block_size) * block_size
-    # padded_max_seq_len + extra buffer
-    required_shared_mem = (padded_max_seq_len + 512) * float32_bytes
-    if padded_max_seq_len * float32_bytes > max_shared_mem:
-        raise RuntimeError(
-            f"Aphrodite cannot currently support max_model_len={max_seq_len} "
-            f"with block_size={block_size} on GPU with compute "
-            f"capability {torch.cuda.get_device_capability()} "
-            f"(required shared memory {required_shared_mem} > "
-            f"available shared memory {max_shared_mem}). "
-            "This will be fixed in a future release.")
-
-
 def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     if torch_dtype == torch.bfloat16:
         compute_capability = torch.cuda.get_device_capability()