1 year ago · 358e247841
--- a/aphrodite/processing/block_manager.py
+++ b/aphrodite/processing/block_manager.py
@@ -0,0 +1,229 @@
 
				+"""A block manager that manages token blocks."""
			
 
				+from typing import Dict, List, Optional, Set, Tuple, Any
			
 
				+from aphrodite.block import PhysicalTokenBlock
			
 
				+from aphrodite.sequence import Sequence, SequenceGroup, SequenceStatus
			
 
				+from aphrodite.utils import Device
			
 
				+
			
 
				+class BlockAllocator:
			
 
				+    """Manages free physical token blocks for a device.
			
 
				+
			
 
				+    The allocator maintains a list of free blocks and allocates a block when
			
 
				+    requested. When a block is freed, its reference count is decremented. If
			
 
				+    the reference count becomes zero, the block is added back to the free list.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        device: Device,
			
 
				+        block_size: int,
			
 
				+        num_blocks: int,
			
 
				+    ) -> None:
			
 
				+        self.device = device
			
 
				+        self.block_size = block_size
			
 
				+        self.num_blocks = num_blocks
			
 
				+
			
 
				+        self.free_blocks: List[PhysicalTokenBlock] = []
			
 
				+        for i in range(num_blocks):
			
 
				+            block = PhysicalTokenBlock(
			
 
				+                device=device, block_number=1, block_size=block_size)
			
 
				+            self.free_blocks.append(block)
			
 
				+
			
 
				+    def allocate(self) -> PhysicalTokenBlock:
			
 
				+        if not self.free_blocks:
			
 
				+            raise ValueError("Out Of Memory! No free blocks are available.")
			
 
				+        block = self.free_blocks.pop()
			
 
				+        block.ref_count = 1
			
 
				+        return block
			
 
				+
			
 
				+    def free(self, block: PhysicalTokenBlock) -> None:
			
 
				+        if block.ref_count == 0:
			
 
				+            raise ValueError(f"Double free! {block} is already freed.")
			
 
				+        block.ref_count -= 1
			
 
				+        if block.ref_count == 0:
			
 
				+            self.free_blocks.append(block)
			
 
				+
			
 
				+    def get_num_free_blocks(self) -> int:
			
 
				+        return len(self.free_blocks)
			
 
				+
			
 
				+BlockTable = List[PhysicalTokenBlock]
			
 
				+
			
 
				+class BlockSpaceManager:
			
 
				+    """Manages the mapping between logical and physical blocks."""
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        block_size: int,
			
 
				+        num_gpu_blocks: int,
			
 
				+        num_cpu_blocks: int,
			
 
				+        watermark: float = 0.01,
			
 
				+    ) -> None:
			
 
				+        self.block_size = block_size
			
 
				+        self.num_total_gpu_blocks = num_gpu_blocks
			
 
				+        self.num_total_cpu_blocks = num_cpu_blocks
			
 
				+        self.watermark = watermark
			
 
				+        assert watermark >= 0.0
			
 
				+
			
 
				+        self.watermark_blocks = int(watermark * num_gpu_blocks)
			
 
				+        self.gpu_allocator = BlockAllocator(DEVICE.GPU, block_size, num_gpu_blocks)
			
 
				+        self.cpu_allocator = BlockAllocator(DEVICE.CPU, block_size, num_cpu_blocks)
			
 
				+
			
 
				+        self.block_tables: Dict[int, BlockTable] = {}
			
 
				+
			
 
				+    def can_allocate(self, seq_group: SequenceGroup) -> bool:
			
 
				+        """
			
 
				+        NOTE: we assume that all sequences in the group share the same prompt.
			
 
				+        This might not be true for preempted sequences. Needs fixing.
			
 
				+        """
			
 
				+        seq = seq_group.get_seqs()[0]
			
 
				+        num_required_blocks = len(seq.logical_token_blocks)
			
 
				+        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
			
 
				+        return num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks
			
 
				+
			
 
				+    def allocate(self, seq_group: SequenceGroup) -> None:
			
 
				+        seq = seq_group.get_seqs()[0]
			
 
				+
			
 
				+        block_table: BlockTable = []
			
 
				+        for _ in range(len(seq.logical_token_blocks)):
			
 
				+            block = self.gpu_allocator.allocate()
			
 
				+            block.ref_count = seq_group.num_seqs()
			
 
				+            block_table.append(block)
			
 
				+
			
 
				+        for seq in seq_group.get_seqs():
			
 
				+            self.block_tables[seq.seq_id] = block_table.copy()
			
 
				+
			
 
				+    def can_append_slot(self, seq_group: SequenceGroup) -> bool:
			
 
				+        """
			
 
				+        Simple heuristic: If there's at least one free block
			
 
				+        for each sequence, we can append.
			
 
				+        """
			
 
				+        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
			
 
				+        num_seq = seq_group.num_seqs(status=SequenceStatus.RUNNING)
			
 
				+        return num_seqs <= num_free_gpu_blocks
			
 
				+
			
 
				+    def append_slot(self, req: Sequence) -> Optional[Tuple[int, int]]:
			
 
				+        """Allocate a physical slot for a new token"""
			
 
				+        logical_blocks = seq.logical_token_blocks
			
 
				+        block_table = self.block_tables[seq.seq_id]
			
 
				+
			
 
				+        if len(block_table) < len(logical_blocks):
			
 
				+            block = self.gpu_allocator.allocate()
			
 
				+            block_table.append(block)
			
 
				+            return None
			
 
				+
			
 
				+        last_block = block_table[-1]
			
 
				+        assert last_block.device == Device.GPU
			
 
				+        if last_block.ref_count == 1:
			
 
				+            return None
			
 
				+        else:
			
 
				+            new_block = self.gpu_allocator.allocate()
			
 
				+            block_table[-1] = new_block
			
 
				+            self.gpu_allocator.free(last_block)
			
 
				+            return last_block.block_number, new_block.block_number
			
 
				+    
			
 
				+    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
			
 
				+        src_block_table = self.block_size[parent_seq.seq_id]
			
 
				+        self.block_tables[child_seq.seq_id] = src_block_table.copy()
			
 
				+        for block in src_block_table:
			
 
				+            block.ref_count += 1
			
 
				+
			
 
				+    def _get_physical_blocks(self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]:
			
 
				+        blocks: Set[PhysicalTokenBlock] = set()
			
 
				+        for seq in seq_group.get_seqs():
			
 
				+            if seq.is_finished():
			
 
				+                continue
			
 
				+            block_table = self.block_size[seq.seq_id]
			
 
				+            for block in block_table:
			
 
				+                blocks.add(block)
			
 
				+        return list(blocks)
			
 
				+
			
 
				+
			
 
				+    def can_swap_in(self, seq_group: SequenceGroup) -> bool:
			
 
				+        blocks = self._get_physical_blocks(seq_group)
			
 
				+        num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
			
 
				+        num_free_blocks = self.gpu_allocator.get_num_free_blocks()
			
 
				+        num_required_blocks = len(blocks) + num_swapped_seqs
			
 
				+        return num_free_blocks - num_free_blocks >= self.watermark_blocks
			
 
				+
			
 
				+    def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]:
			
 
				+        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
			
 
				+        for seq in seq_group.get_seqs():
			
 
				+            if seq.is_finished():
			
 
				+                continue
			
 
				+            new_block_table: BlockTable = []
			
 
				+            block_table = self.block_tables[seq.seq_id]
			
 
				+
			
 
				+            for cpu_block in block_table:
			
 
				+                if cpu_block in mapping:
			
 
				+                    gpu_block = mapping[cpu_block]
			
 
				+                    gpu_block.ref_count += 1
			
 
				+                else:
			
 
				+                    gpu_block = self.gpu_allocator.allocate()
			
 
				+                    mapping[cpu_block] = gpu_block
			
 
				+                new_block_table.append(gpu_block)
			
 
				+                self.cpu_allocator.free(cpu_block)
			
 
				+            self.block_tables[seq.seq_id] = new_block_table
			
 
				+
			
 
				+        block_number_mapping = {
			
 
				+            cpu_block.block_number: gpu_block.block_number
			
 
				+            for cpu_block, gpu_block in mapping.items()
			
 
				+        }
			
 
				+        return block_number_mapping
			
 
				+    
			
 
				+    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
			
 
				+        blocks = self._get_physical_blocks(seq_group)
			
 
				+        return len(blocks) <= self.cpu_allocator.get_num_free_blocks()
			
 
				+
			
 
				+    def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
			
 
				+        # GPU block -> CPU block
			
 
				+        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
			
 
				+        for seq in seq_group.get_seqs():
			
 
				+            if seq.is_finished():
			
 
				+                continue
			
 
				+            new_block_table: BlockTable = []
			
 
				+            block_table = self.block_tables[seq.seq_id]
			
 
				+
			
 
				+            for gpu_block in block_table:
			
 
				+                if gpu_block in mapping:
			
 
				+                    cpu_block = mapping[gpu_block]
			
 
				+                    cpu_block.ref_count += 1
			
 
				+                else:
			
 
				+                    cpu_block = self.cpu_allocator.allocate
			
 
				+                    maping[gpu_block] = cpu_block
			
 
				+                new_block_table.append(cpu_block)
			
 
				+                self.gpu_allocator.free(gpu_block)
			
 
				+            self.block_tables[seq.seq_id] = new_block_table
			
 
				+
			
 
				+        block_table_mapping = {
			
 
				+            gpu_block.block_number: cpu_block.block_number
			
 
				+            for gpu_block, cpu_block in mapping.items()
			
 
				+        }
			
 
				+        return block_number_mapping
			
 
				+
			
 
				+    def _free_block_table(self, block_table: BlockTable) -> None:
			
 
				+        for block in block_table:
			
 
				+            if block.device == DEVICE.GPU:
			
 
				+                self.gpu_allocator.free(block)
			
 
				+            else:
			
 
				+                self.cpu_allocator.free(block)
			
 
				+
			
 
				+    def free(self, seq: Sequence) -> None:
			
 
				+        if seq.seq_id not in self.block_tables:
			
 
				+            return
			
 
				+        block_table = self.block_tables[seq.seq_id]
			
 
				+        self._free_block_table[block_table]
			
 
				+        del self.block_tables[seq.seq_id]
			
 
				+
			
 
				+    def reset(self) -> None:
			
 
				+        for block_table in self.block_tables.values():
			
 
				+            self._free_block_table(block_table)
			
 
				+        self.block_tables.clear()
			
 
				+
			
 
				+    def get_block_table(self, seq: Sequence) -> List[int]:
			
 
				+        block_table = self.block_tables[seq.seq_id]
			
 
				+        return [block.block_number for block in block_table]
			
 
				+
			
 
				+    def get_num_free_gpu_blocks(self) -> int:
			
 
				+        return self.gpu_allocator.get_num_free_blocks()
			
 
				+
			
 
				+    def get_num_free_cpu_blocks(self) -> int:
			
 
				+        return self.cpu_allocator.get_num_free_blocks()