interfaces.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. import enum
  2. from abc import ABC, abstractmethod
  3. from typing import List
  4. from typing import Sequence as GenericSequence
  5. from typing import Tuple
  6. from aphrodite.common.sequence import Sequence, SequenceGroup
  7. from aphrodite.common.utils import Device
  8. class AllocStatus(enum.Enum):
  9. """Result for BlockSpaceManager.can_allocate
  10. 1. Ok: seq_group can be allocated now.
  11. 2. Later: seq_group cannot be allocated.
  12. The capacity of allocator is larger than seq_group required.
  13. 3. Never: seq_group can never be allocated.
  14. The seq_group is too large to allocated in GPU.
  15. """
  16. OK = enum.auto()
  17. LATER = enum.auto()
  18. NEVER = enum.auto()
  19. class BlockSpaceManager(ABC):
  20. @staticmethod
  21. def get_block_space_manager_class(version: str):
  22. version = version.lower()
  23. if version == "v1":
  24. from aphrodite.processing.block_manager_v1 import (
  25. BlockSpaceManagerV1)
  26. return BlockSpaceManagerV1
  27. if version == "v2":
  28. from aphrodite.processing.block_manager_v2 import (
  29. BlockSpaceManagerV2)
  30. return BlockSpaceManagerV2
  31. if version == "placeholder":
  32. from aphrodite.processing.placeholder_block_space_manager import (
  33. PlaceholderBlockSpaceManager)
  34. return PlaceholderBlockSpaceManager
  35. raise ValueError(f"Unknown version {version=}")
  36. @abstractmethod
  37. def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
  38. pass
  39. @abstractmethod
  40. def allocate(self, seq_group: SequenceGroup) -> None:
  41. pass
  42. @abstractmethod
  43. def can_append_slots(self, seq_group: SequenceGroup,
  44. num_lookahead_slots: int) -> bool:
  45. pass
  46. @abstractmethod
  47. def append_slots(
  48. self,
  49. seq: Sequence,
  50. num_lookahead_slots: int,
  51. ) -> List[Tuple[int, int]]:
  52. pass
  53. @abstractmethod
  54. def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
  55. pass
  56. @abstractmethod
  57. def can_swap_in(self, seq_group: SequenceGroup,
  58. num_lookahead_slots: int) -> AllocStatus:
  59. pass
  60. @abstractmethod
  61. def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
  62. pass
  63. @abstractmethod
  64. def can_swap_out(self, seq_group: SequenceGroup) -> bool:
  65. pass
  66. @abstractmethod
  67. def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
  68. pass
  69. @abstractmethod
  70. def free(self, seq: Sequence) -> None:
  71. pass
  72. @abstractmethod
  73. def get_block_table(self, seq: Sequence) -> List[int]:
  74. pass
  75. @abstractmethod
  76. def get_num_free_gpu_blocks(self) -> int:
  77. pass
  78. @abstractmethod
  79. def get_num_free_cpu_blocks(self) -> int:
  80. pass
  81. @abstractmethod
  82. def access_all_blocks_in_seq(
  83. self,
  84. seq: Sequence,
  85. access_time: float,
  86. ) -> None:
  87. pass
  88. @abstractmethod
  89. def get_common_computed_block_ids(
  90. self, seqs: List[Sequence]) -> GenericSequence[int]:
  91. pass
  92. @abstractmethod
  93. def mark_blocks_as_computed(self, seq_group: SequenceGroup,
  94. token_chunk_size: int):
  95. pass
  96. @abstractmethod
  97. def get_prefix_cache_hit_rate(self, device: Device) -> float:
  98. """Prefix cache hit rate. -1 means not supported or disabled."""
  99. pass