123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127 |
- import enum
- from abc import ABC, abstractmethod
- from typing import List
- from typing import Sequence as GenericSequence
- from typing import Tuple
- from aphrodite.common.sequence import Sequence, SequenceGroup
- from aphrodite.common.utils import Device
class AllocStatus(enum.Enum):
    """Result for BlockSpaceManager.can_allocate.

    Members:
        OK: seq_group can be allocated now.
        LATER: seq_group cannot be allocated at this point, although the
            capacity of the allocator is larger than what seq_group
            requires.
        NEVER: seq_group can never be allocated; it is too large to be
            allocated in GPU.
    """

    # Values come from enum.auto(), so only identity/name comparisons
    # should be relied upon, not the concrete integers.
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()
class BlockSpaceManager(ABC):
    """Abstract base class declaring the block space manager interface.

    ``get_block_space_manager_class`` maps a version string to a concrete
    implementation; every other method is an abstract hook whose exact
    behaviour is defined by that implementation.
    """

    @staticmethod
    def get_block_space_manager_class(version: str):
        """Look up the block space manager implementation for ``version``.

        The match is case-insensitive, and an implementation module is
        imported only when its version is actually requested.

        Args:
            version: ``"v1"``, ``"v2"`` or ``"placeholder"``.

        Returns:
            ``BlockSpaceManagerV1``, ``BlockSpaceManagerV2`` or
            ``PlaceholderBlockSpaceManager`` respectively. The imported
            object itself is returned; it is not instantiated here.

        Raises:
            ValueError: If ``version`` matches none of the known names.
        """
        version = version.lower()

        if version == "v1":
            from aphrodite.processing.block_manager_v1 import (
                BlockSpaceManagerV1)
            return BlockSpaceManagerV1

        if version == "v2":
            from aphrodite.processing.block_manager_v2 import (
                BlockSpaceManagerV2)
            return BlockSpaceManagerV2

        if version == "placeholder":
            from aphrodite.processing.placeholder_block_space_manager import (
                PlaceholderBlockSpaceManager)
            return PlaceholderBlockSpaceManager

        raise ValueError(f"Unknown version {version=}")

    @abstractmethod
    def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
        """Report, as an ``AllocStatus``, whether ``seq_group`` can be
        allocated."""

    @abstractmethod
    def allocate(self, seq_group: SequenceGroup) -> None:
        """Perform the allocation for ``seq_group``."""

    @abstractmethod
    def can_append_slots(self, seq_group: SequenceGroup,
                         num_lookahead_slots: int) -> bool:
        """Report whether slots can be appended for ``seq_group``.

        Args:
            seq_group: The sequence group to check.
            num_lookahead_slots: Number of lookahead slots to take into
                account (how they are used is implementation-defined).
        """

    @abstractmethod
    def append_slots(
        self,
        seq: Sequence,
        num_lookahead_slots: int,
    ) -> List[Tuple[int, int]]:
        """Append slots for ``seq``.

        Args:
            seq: The sequence to append slots for.
            num_lookahead_slots: Number of lookahead slots to take into
                account (how they are used is implementation-defined).

        Returns:
            A list of integer pairs.
            NOTE(review): presumably block-number mappings -- confirm
            against the concrete implementations.
        """

    @abstractmethod
    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        """Fork ``child_seq`` from ``parent_seq``."""

    @abstractmethod
    def can_swap_in(self, seq_group: SequenceGroup,
                    num_lookahead_slots: int) -> AllocStatus:
        """Report, as an ``AllocStatus``, whether ``seq_group`` can be
        swapped in.

        Args:
            seq_group: The sequence group to check.
            num_lookahead_slots: Number of lookahead slots to take into
                account (how they are used is implementation-defined).
        """

    @abstractmethod
    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        """Swap ``seq_group`` in.

        Returns:
            A list of integer pairs.
            NOTE(review): presumably block-number mappings -- confirm
            against the concrete implementations.
        """

    @abstractmethod
    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
        """Report whether ``seq_group`` can be swapped out."""

    @abstractmethod
    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        """Swap ``seq_group`` out.

        Returns:
            A list of integer pairs.
            NOTE(review): presumably block-number mappings -- confirm
            against the concrete implementations.
        """

    @abstractmethod
    def free(self, seq: Sequence) -> None:
        """Free what this manager holds for ``seq``."""

    @abstractmethod
    def get_block_table(self, seq: Sequence) -> List[int]:
        """Return the block table of ``seq`` as a list of ints."""

    @abstractmethod
    def get_num_free_gpu_blocks(self) -> int:
        """Return the number of free GPU blocks."""

    @abstractmethod
    def get_num_free_cpu_blocks(self) -> int:
        """Return the number of free CPU blocks."""

    @abstractmethod
    def access_all_blocks_in_seq(
        self,
        seq: Sequence,
        access_time: float,
    ) -> None:
        """Register an access to all blocks in ``seq``.

        Args:
            seq: The sequence whose blocks are accessed.
            access_time: Time of the access.
                NOTE(review): presumably a timestamp -- confirm units
                with the callers.
        """

    @abstractmethod
    def get_common_computed_block_ids(
            self, seqs: List[Sequence]) -> GenericSequence[int]:
        """Return the computed block ids common to ``seqs``."""

    @abstractmethod
    def mark_blocks_as_computed(self, seq_group: SequenceGroup,
                                token_chunk_size: int) -> None:
        """Mark blocks of ``seq_group`` as computed.

        Args:
            seq_group: The sequence group whose blocks are marked.
            token_chunk_size: Size of the token chunk (how it is used is
                implementation-defined).
        """

    @abstractmethod
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
|