# cpu_gpu_block_allocator.py (7.6 KB)
from typing import Dict, FrozenSet, List, Optional

from aphrodite.common.utils import Device
from aphrodite.processing.block.interfaces import (
    Block,
    BlockAllocator,
    DeviceAwareBlockAllocator,
)
from aphrodite.processing.block.naive_block import (
    NaiveBlock,
    NaiveBlockAllocator,
)
from aphrodite.processing.block.prefix_caching_block import (
    PrefixCachingBlockAllocator, )
  14. class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
  15. """A block allocator that can allocate blocks on both CPU and GPU memory.
  16. This class implements the `DeviceAwareBlockAllocator` interface and provides
  17. functionality for allocating and managing blocks of memory on both CPU and
  18. GPU devices.
  19. The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU
  20. blocks, and allows for allocation, deallocation, forking, and swapping of
  21. blocks across these memory pools.
  22. """
  23. @staticmethod
  24. def create(
  25. allocator_type: str,
  26. num_gpu_blocks: int,
  27. num_cpu_blocks: int,
  28. block_size: int,
  29. ) -> DeviceAwareBlockAllocator:
  30. """Creates a CpuGpuBlockAllocator instance with the specified
  31. configuration.
  32. This static method creates and returns a CpuGpuBlockAllocator instance
  33. based on the provided parameters. It initializes the CPU and GPU block
  34. allocators with the specified number of blocks, block size, and
  35. allocator type.
  36. Args:
  37. allocator_type (str): The type of block allocator to use for CPU
  38. and GPU blocks. Currently supported values are "naive" and
  39. "prefix_caching".
  40. num_gpu_blocks (int): The number of blocks to allocate for GPU
  41. memory.
  42. num_cpu_blocks (int): The number of blocks to allocate for CPU
  43. memory.
  44. block_size (int): The size of each block in number of tokens.
  45. Returns:
  46. DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the
  47. specified configuration.
  48. Notes:
  49. - The block IDs are assigned contiguously, with GPU block IDs coming
  50. before CPU block IDs.
  51. """
  52. block_ids = list(range(num_gpu_blocks + num_cpu_blocks))
  53. gpu_block_ids = block_ids[:num_gpu_blocks]
  54. cpu_block_ids = block_ids[num_gpu_blocks:]
  55. if allocator_type == "naive":
  56. gpu_allocator = NaiveBlockAllocator(
  57. create_block=NaiveBlock,
  58. num_blocks=num_gpu_blocks,
  59. block_size=block_size,
  60. block_ids=gpu_block_ids,
  61. )
  62. cpu_allocator = NaiveBlockAllocator(
  63. create_block=NaiveBlock,
  64. num_blocks=num_cpu_blocks,
  65. block_size=block_size,
  66. block_ids=cpu_block_ids,
  67. )
  68. elif allocator_type == "prefix_caching":
  69. gpu_allocator = PrefixCachingBlockAllocator(
  70. num_blocks=num_gpu_blocks,
  71. block_size=block_size,
  72. block_ids=gpu_block_ids,
  73. )
  74. cpu_allocator = PrefixCachingBlockAllocator(
  75. num_blocks=num_cpu_blocks,
  76. block_size=block_size,
  77. block_ids=cpu_block_ids,
  78. )
  79. else:
  80. raise ValueError(f"Unknown allocator type {allocator_type=}")
  81. return CpuGpuBlockAllocator(
  82. cpu_block_allocator=cpu_allocator,
  83. gpu_block_allocator=gpu_allocator,
  84. )
  85. def __init__(
  86. self,
  87. cpu_block_allocator: BlockAllocator,
  88. gpu_block_allocator: BlockAllocator,
  89. ):
  90. assert not (
  91. cpu_block_allocator.all_block_ids
  92. & gpu_block_allocator.all_block_ids
  93. ), "cpu and gpu block allocators can't have intersection of block ids"
  94. self._allocators = {
  95. Device.CPU: cpu_block_allocator,
  96. Device.GPU: gpu_block_allocator,
  97. }
  98. self._block_ids_to_allocator = {}
  99. for _, allocator in self._allocators.items():
  100. for block_id in allocator.all_block_ids:
  101. self._block_ids_to_allocator[block_id] = allocator
  102. def allocate_mutable(self, prev_block: Optional[Block],
  103. device: Device) -> Block:
  104. """Allocates a new mutable block on the specified device.
  105. Args:
  106. prev_block (Optional[Block]): The previous block to in the sequence.
  107. Used for prefix hashing.
  108. device (Device): The device on which to allocate the new block.
  109. Returns:
  110. Block: The newly allocated mutable block.
  111. """
  112. return self._allocators[device].allocate_mutable(prev_block)
  113. def allocate_immutable(self, prev_block: Optional[Block],
  114. token_ids: List[int], device: Device) -> Block:
  115. """Allocates a new immutable block with the provided token IDs on the
  116. specified device.
  117. Args:
  118. prev_block (Optional[Block]): The previous block in the sequence.
  119. Used for prefix hashing.
  120. token_ids (List[int]): The list of token IDs to be stored in the new
  121. block.
  122. device (Device): The device on which to allocate the new block.
  123. Returns:
  124. Block: The newly allocated immutable block containing the provided
  125. token IDs.
  126. """
  127. return self._allocators[device].allocate_immutable(
  128. prev_block, token_ids)
  129. def free(self, block: Block) -> None:
  130. """Frees the memory occupied by the given block.
  131. Args:
  132. block (Block): The block to be freed.
  133. """
  134. allocator = self._block_ids_to_allocator[block.block_id]
  135. return allocator.free(block)
  136. def fork(self, last_block: Block) -> List[Block]:
  137. """Creates a new sequence of blocks that shares the same underlying
  138. memory as the original sequence.
  139. Args:
  140. last_block (Block): The last block in the original sequence.
  141. Returns:
  142. List[Block]: A new list of blocks that shares the same memory as the
  143. original sequence.
  144. """
  145. allocator = self._block_ids_to_allocator[last_block.block_id]
  146. return allocator.fork(last_block)
  147. def get_num_free_blocks(self, device: Device) -> int:
  148. """Returns the number of free blocks available on the specified device.
  149. Args:
  150. device (Device): The device for which to query the number of free
  151. blocks.
  152. Returns:
  153. int: The number of free blocks available on the specified device.
  154. """
  155. return self._allocators[device].get_num_free_blocks()
  156. def clear_copy_on_writes(self) -> Dict[int, List[int]]:
  157. """Clears the copy-on-write (CoW) state and returns the mapping of
  158. source to destination block IDs.
  159. Returns:
  160. Dict[int, List[int]]: A dictionary mapping source block IDs to lists
  161. of destination block IDs.
  162. """
  163. # CoW only supported on GPU
  164. device = Device.GPU
  165. return self._allocators[device].clear_copy_on_writes()
  166. def mark_blocks_as_computed(self) -> None:
  167. # Prefix caching only supported on GPU.
  168. device = Device.GPU
  169. return self._allocators[device].mark_blocks_as_computed()
  170. def get_common_computed_block_ids(
  171. self, seq_block_ids: List[List[int]]) -> List[int]:
  172. # Prefix caching only supported on GPU.
  173. device = Device.GPU
  174. return self._allocators[device].get_common_computed_block_ids(
  175. seq_block_ids)
  176. def all_block_ids(self) -> frozenset[int]:
  177. return frozenset(self._block_ids_to_allocator.keys())