multi_step.py

import functools
from typing import Callable, List

from transformers import PreTrainedTokenizer

from aphrodite.common.logger import log_once
from aphrodite.common.sampling_params import SamplingParams
from aphrodite.common.sequence import (Sequence, SequenceGroup,
                                       SequenceGroupOutput, SequenceOutput,
                                       SequenceStatus)
from aphrodite.common.utils import Counter
from aphrodite.engine.output_processor.interfaces import (
    SequenceGroupOutputProcessor)
from aphrodite.engine.output_processor.single_step import (
    single_step_process_prompt_logprob)
from aphrodite.engine.output_processor.stop_checker import StopChecker
from aphrodite.processing.scheduler import Scheduler
from aphrodite.transformers_utils.detokenizer import Detokenizer

class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
    """SequenceGroupOutputProcessor which handles logic related to
    detokenization and stopping conditions. It specializes to "multi-step
    decoding", where Aphrodite's worker may generate multiple tokens per
    invocation. This is currently mutually exclusive with advanced sampling
    techniques like beam search, which motivates the separation of this logic
    from the single-step output processor.

    This class is responsible for things such as correctly appending all new
    token ids to their sequence, detokenizing new token ids, truncating new
    output tokens after an eos token, and correctly handling the case where
    the number of new output tokens per sequence differs in a single batch.
    """
    def __init__(
        self,
        detokenizer: Detokenizer,
        scheduler: List[Scheduler],
        seq_counter: Counter,
        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
        stop_checker: StopChecker,
    ):
        self.detokenizer = detokenizer
        self.scheduler = scheduler
        self.seq_counter = seq_counter
        self.get_tokenizer_for_seq = get_tokenizer_for_seq
        self.stop_checker = stop_checker
    def process_prompt_logprob(self, seq_group: SequenceGroup,
                               outputs: List[SequenceGroupOutput]) -> None:
        """Process prompt logprobs associated with each step of a multi-step-
        scheduled computation.

        Args:
            seq_group: the outputs are associated with this
                :class:`SequenceGroup`
            outputs: the :class:`SequenceGroupOutput`s for all scheduler steps
        """
        for output in outputs:
            # Concatenate single-step prompt logprob processing results.
            single_step_process_prompt_logprob(self, seq_group, output)
    @staticmethod
    @functools.lru_cache()
    def _log_prompt_logprob_unsupported_warning_once():
        log_once(
            level="WARNING",
            message="Prompt logprob is not supported by multi-step workers. "
            "(e.g., speculative decode uses multi-step workers).")
    def process_outputs(self,
                        sequence_group: SequenceGroup,
                        outputs: List[SequenceGroupOutput],
                        is_async: bool = False) -> None:
        """Append new tokens in the outputs to sequences in the sequence
        group.

        This only supports sequence groups of size 1. It supports greater
        than one new token per sequence.

        This applies logic like stop-condition checking and detokenization.
        It also handles cases where there are tokens emitted after the EOS
        token.

        is_async - Indicates whether this postprocessor runs in parallel
        with the GPU forward pass and is processing tokens from the previous
        step. If this is true, then no tokens need to be appended since it
        is already done externally (before the next schedule() call).
        """
        # Sequences can be in RUNNING or FINISHED_ABORTED state
        # once scheduled, as a sequence is moved to FINISHED_ABORTED
        # if a client disconnects from the API server.
        seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
        if seqs is None:
            seqs = sequence_group.get_seqs(
                status=SequenceStatus.FINISHED_ABORTED)

        assert seqs, "Expected RUNNING or FINISHED_ABORTED sequences"
        assert len(seqs) == 1, (
            "Beam search not supported in multi-step decoding.")
        seq = seqs[0]
        if is_async:
            # Async case: We process tokens one by one. Here, we know the
            # token was already appended, so we only need to do the rest of
            # the postprocessor: detokenization + stopping logic.
            self._process_decode_and_stop(seq, sequence_group.sampling_params)
        else:
            # Standard multi-step case.

            # Since there's only one sequence per sequence group,
            # we can take the first sample.
            samples = [output.samples[0] for output in outputs]

            # -1 means the output token is not valid (e.g. due to spec decode
            # rejecting tokens).
            valid_samples = [
                sample for sample in samples if sample.output_token != -1
            ]
            assert valid_samples
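            # For example (hypothetical values): if speculative decoding
            # proposed five tokens but the target model accepted only the
            # first two, the per-step output tokens might look like
            # [1001, 1002, -1, -1, -1], and valid_samples keeps only the
            # first two samples.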
            self._process_seq_outputs(seq, valid_samples,
                                      sequence_group.sampling_params)
    def _process_decode_and_stop(self, seq: Sequence,
                                 sampling_params: SamplingParams) -> None:
        new_char_count = 0
        if sampling_params.detokenize:
            new_char_count = self.detokenizer.decode_sequence_inplace(
                seq, sampling_params)

        # TODO(sang): Support lora.
        self.stop_checker.maybe_stop_sequence(
            seq,
            new_char_count=new_char_count,
            sampling_params=sampling_params,
        )
    def _process_seq_outputs(self, seq: Sequence,
                             valid_samples: List[SequenceOutput],
                             sampling_params: SamplingParams) -> None:
        output_token_ids = [sample.output_token for sample in valid_samples]
        output_logprobs = [sample.logprobs for sample in valid_samples]

        # Truncate to max_tokens if necessary.
        remaining_tokens = sampling_params.max_tokens - (
            seq.get_output_len() + len(output_token_ids))
        if remaining_tokens < 0:
            valid_samples = valid_samples[:remaining_tokens]
            output_token_ids = output_token_ids[:remaining_tokens]
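        # Worked example (hypothetical numbers): with max_tokens=10, an
        # existing output length of 8, and 5 new tokens, remaining_tokens is
        # 10 - (8 + 5) = -3, so the negative slice [:-3] keeps only the
        # first 2 new tokens.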
        # Truncate any tokens after EOS. This is required as spec decode
        # generates a fixed number of tokens without evaluating stopping
        # conditions within the block. This can cause an eos token to be
        # unintentionally ignored.
        if not sampling_params.ignore_eos:
            eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id
            # Avoid .index calls, as exception throwing in the happy path
            # is expensive.
            for i in range(len(output_token_ids)):
                if output_token_ids[i] == eos_token_id:
                    output_token_ids = output_token_ids[:i + 1]
                    valid_samples = valid_samples[:i + 1]
                    break
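        # For example (hypothetical ids, with eos_token_id=2): new tokens
        # [5, 9, 2, 7, 4] are truncated to [5, 9, 2], so nothing emitted
        # after the EOS token is ever appended to the sequence.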
        # Incrementally append tokens to the sequence, as if we had only one
        # new token.
        for output_token_id, output_logprob in zip(output_token_ids,
                                                   output_logprobs):
            seq.append_token_id(
                token_id=output_token_id,
                logprobs=output_logprob,
            )
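            # Running detokenization and stop checks after every appended
            # token keeps behavior identical to single-step decoding: e.g.
            # if the second of five new tokens completes a stop string, the
            # sequence finishes there and the remaining tokens are never
            # appended.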
            self._process_decode_and_stop(seq, sampling_params)

            if seq.is_finished():
                break