smaller_tp_proposer_worker.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. from typing import List, Optional, Set, Tuple
  2. import torch
  3. from loguru import logger
  4. from aphrodite.common.sequence import ExecuteModelRequest, SamplerOutput
  5. from aphrodite.distributed.parallel_state import (get_tp_group,
  6. init_model_parallel_group,
  7. patch_tensor_parallel_group)
  8. from aphrodite.spec_decode.interfaces import SpeculativeProposals
  9. from aphrodite.spec_decode.multi_step_worker import MultiStepWorker
  10. from aphrodite.spec_decode.proposer_worker_base import ProposerWorkerBase
  class SmallerTpProposerWorker(ProposerWorkerBase):
      """Proposer wrapper that runs a draft model on fewer TP ranks.

      A speculative draft model may use a smaller tensor parallel degree than
      the target model, which cuts communication overhead for small drafts.

      Each instance is either a participant or a "dummy" (see ``_is_dummy``).
      Dummies take no part in draft generation and short-circuit most calls.
      Participants temporarily swap Aphrodite's tensor parallel group for a
      smaller one while calling into the wrapped worker.
      """

      @classmethod
      def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int,
                            target_tensor_parallel_size: int):
          """Return ``worker`` wrapped in this class only when TP sizes differ.

          Args:
              worker: the proposer worker to (possibly) wrap.
              draft_tensor_parallel_size: tensor parallel degree of the draft.
              target_tensor_parallel_size: tensor parallel degree of the
                  target.

          Returns:
              ``worker`` itself when both sizes are equal, otherwise a new
              instance of ``cls`` built around it.
          """
          sizes_match = (
              draft_tensor_parallel_size == target_tensor_parallel_size)
          if sizes_match:
              # Nothing to shrink: hand the worker back untouched.
              return worker

          # Ranks 0..draft_tp-1 are the GPUs that draft tokens together.
          participating_ranks = [*range(draft_tensor_parallel_size)]
          logger.info(f"Wrapping {type(worker)} in {cls}")
          return cls(worker, participating_ranks)

      def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]):
          """Create a SmallerTpProposerWorker.

          Args:
              worker (MultiStepWorker): the real worker that this object
                  delegates to.
              draft_ranks (List[int]): GPU ranks that take part in draft
                  generation; ranks outside this list become dummies.
          """
          # Both fields get their real values in init_device().
          self._tp_group = None
          self._is_dummy = False

          self._draft_ranks = draft_ranks
          self._worker = worker

      def _patch_tensor_parallel_group(self):
          """Return a context manager that installs this worker's tp group.

          The global tp group state is replaced by ``self._tp_group`` only
          temporarily, for the duration of the ``with`` block.
          """
          draft_group = self._tp_group
          return patch_tensor_parallel_group(draft_group)

      def init_device(self) -> None:
          """Decide dummy status, then set up the draft tp group if needed.

          Dummy ranks stop right after the membership check. Participating
          ranks create the smaller group and initialise the wrapped worker
          while that group is patched in.
          """
          takes_part = get_tp_group().rank in self._draft_ranks
          self._is_dummy = not takes_part

          if takes_part:
              # Build a tp process group holding only a subset of gpu ranks,
              # reusing the local rank and backend of the current tp group.
              local_rank = get_tp_group().local_rank
              device_group = get_tp_group().device_group
              tp_backend = torch.distributed.get_backend(device_group)
              self._tp_group = init_model_parallel_group(
                  [self._draft_ranks], local_rank, tp_backend)

              with self._patch_tensor_parallel_group():
                  self._worker.init_device()
          # Dummy workers do nothing further.

      def set_include_gpu_probs_tensor(self) -> None:
          """Forward the request to the wrapped worker on non-dummy ranks."""
          if not self._is_dummy:
              # multi_step_worker needs include_gpu_probs_tensor.
              self._worker.set_include_gpu_probs_tensor()

      def load_model(self) -> None:
          """Load the draft model under the patched tp group (no-op if dummy)."""
          if not self._is_dummy:
              with self._patch_tensor_parallel_group():
                  self._worker.load_model()

      def determine_num_available_blocks(self) -> Tuple[int, int]:
          """Ask the wrapped worker for its block counts.

          Returns:
              The wrapped worker's answer, or ``(-1, -1)`` on a dummy rank.
          """
          if self._is_dummy:
              # This case is not used now; the pair is only a placeholder.
              placeholder = (-1, -1)
              return placeholder

          with self._patch_tensor_parallel_group():
              block_counts = self._worker.determine_num_available_blocks()
              return block_counts

      def initialize_cache(self, num_gpu_blocks: int,
                           num_cpu_blocks: int) -> None:
          """Initialise the wrapped worker's cache (no-op if dummy)."""
          if not self._is_dummy:
              with self._patch_tensor_parallel_group():
                  self._worker.initialize_cache(num_gpu_blocks,
                                                num_cpu_blocks)

      def sampler_output(
          self,
          execute_model_req: ExecuteModelRequest,
          sample_len: int,
          seq_ids_with_bonus_token_in_last_step: Set[int],
      ) -> Tuple[List[SamplerOutput], bool]:
          """Delegate straight to the wrapped worker's ``sampler_output``."""
          # No _is_dummy check here: this is always reached through
          # get_spec_proposals, which has already handled dummies.
          delegate = self._worker.sampler_output
          return delegate(execute_model_req, sample_len,
                          seq_ids_with_bonus_token_in_last_step)

      def get_spec_proposals(
          self,
          execute_model_req: ExecuteModelRequest,
          seq_ids_with_bonus_token_in_last_step: Set[int],
      ) -> SpeculativeProposals:
          """Produce speculations for an input batch of sequences.

          The number of speculative tokens per sequence is determined by
          max_proposal_len. Dummy ranks return a ``SpeculativeProposals``
          whose three fields are all ``None``.
          """
          if self._is_dummy:
              return SpeculativeProposals(None, None, None)

          with self._patch_tensor_parallel_group():
              proposals = self._worker.get_spec_proposals(
                  execute_model_req, seq_ids_with_bonus_token_in_last_step)
              return proposals

      def execute_model(
          self,
          execute_model_req: Optional[ExecuteModelRequest] = None
      ) -> List[SamplerOutput]:
          """Run the wrapped worker's model; dummy ranks return ``[]``."""
          if self._is_dummy:
              return []

          with self._patch_tensor_parallel_group():
              outputs = self._worker.execute_model(execute_model_req)
              return outputs

      def get_cache_block_size_bytes(self) -> int:
          """Return the wrapped worker's cache block size, or 0 on a dummy.

          Returning zero lets the target worker use the entire kv cache
          space.
          """
          return (0 if self._is_dummy else
                  self._worker.get_cache_block_size_bytes())

      @property
      def vocab_size(self) -> int:
          """Vocabulary size reported by the wrapped worker."""
          wrapped = self._worker
          return wrapped.vocab_size