import weakref
from typing import List, Optional, Set, Tuple

import torch

from aphrodite.common.sequence import (ExecuteModelRequest,
                                       SequenceGroupMetadata)
from aphrodite.modeling.layers.sampler import SamplerOutput
from aphrodite.modeling.sampling_metadata import SamplingMetadata
from aphrodite.spec_decode.interfaces import SpeculativeProposals
from aphrodite.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
from aphrodite.spec_decode.top1_proposer import Top1Proposer
from aphrodite.worker.worker import Worker


class MedusaWorker(NonLLMProposerWorkerBase, Worker):
    """Worker for Medusa."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Lazily initialized in init_device().
        self._proposer: Top1Proposer

    def init_device(self):
        super().init_device()

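        # Top1Proposer wraps this worker (via weakref.proxy to avoid a
        # reference cycle) and turns its sampler output into top-1
        # speculative proposals, capped at the model's maximum length.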
        self._proposer = Top1Proposer(
            weakref.proxy(self),  # type: ignore[arg-type]
            self.device,
            self.vocab_size,
            max_proposal_len=self.max_model_len,
        )

    def set_include_gpu_probs_tensor(self):
        pass

    def set_should_modify_greedy_probs_inplace(self):
        pass

    @torch.inference_mode()
    def sampler_output(
        self,
        execute_model_req: ExecuteModelRequest,
        sample_len: int,
        # Unused parameter.
        seq_ids_with_bonus_token_in_last_step: Set[int],
    ) -> Tuple[List[SamplerOutput], bool]:
        """Run the model forward pass to generate sample_len future tokens.

        Returns the list of sampler outputs, one per Medusa head, along with
        a flag indicating whether the torch tensors in the sampler output
        need to be transposed in the subsequent sampler_output_to_torch
        logic.

        For the Medusa worker, this flag is always False.
        """
        self._raise_if_unsupported(execute_model_req)

        seq_group_metadata_list = execute_model_req.seq_group_metadata_list

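        # Compute per-sequence lengths and query lengths for this step.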
        seq_lens, query_lens = self._prepare_input_tensors(
            seq_group_metadata_list)

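        # Per-request torch generators used for seeded sampling; finished
        # request ids are passed so stale entries can be dropped.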
        generators = self.model_runner.get_generators(
            execute_model_req.finished_requests_ids)

        sampling_metadata = SamplingMetadata.prepare(
            seq_group_metadata_list, seq_lens, query_lens, self.device,
            self.model_runner.pin_memory, generators)

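        # The Medusa heads propose future tokens directly from the base
        # model's hidden states of the previous step; no extra forward pass
        # through the base model is run here.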
        model_outputs = self.model_runner.model.generate_proposals(
            previous_hidden_states=execute_model_req.previous_hidden_states.
            hidden_states,
            sampling_metadata=sampling_metadata)

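        # One SamplerOutput per Medusa head; False indicates that no
        # transpose is needed in sampler_output_to_torch.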
        return model_outputs, False

    def _prepare_input_tensors(
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
    ) -> Tuple[List[int], List[int]]:
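        """Compute the sequence lengths and query lengths for the batch.

        For prompts (including chunked prefills) the query covers only the
        tokens scheduled in this step; for decodes the query is a single
        token.
        """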
        if not seq_group_metadata_list:
            return [], []

        seq_lens: List[int] = []
        query_lens: List[int] = []

        for seq_group_metadata in seq_group_metadata_list:
            is_prompt = seq_group_metadata.is_prompt

            for seq_data in seq_group_metadata.seq_data.values():
                seq_data_len = seq_data.get_len()
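                # For a prompt, cap the sequence length at the already
                # computed context plus this step's token chunk; the query
                # covers only the new chunk. For a decode, the whole
                # sequence is context and the query is a single token.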
                if is_prompt:
                    context_len = seq_data.get_num_computed_tokens()
                    seq_len = min(
                        seq_data_len,
                        context_len + seq_group_metadata.token_chunk_size)
                    seq_lens.append(seq_len)
                    query_lens.append(seq_len - context_len)
                else:
                    seq_lens.append(seq_data_len)
                    query_lens.append(1)

        return seq_lens, query_lens

    def get_spec_proposals(
        self,
        execute_model_req: ExecuteModelRequest,
        seq_ids_with_bonus_token_in_last_step: Set[int],
    ) -> SpeculativeProposals:
        """Produce speculations given an input batch of sequences. The number
        of speculative tokens per sequence is determined by max_proposal_len.
        """
        return self._proposer.get_spec_proposals(
            execute_model_req, seq_ids_with_bonus_token_in_last_step)

    def _raise_if_unsupported(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> None:
        """MedusaWorker does not yet implement support for cache swap
        operations or beam search.
        """
        if any([
                execute_model_req.blocks_to_swap_in,
                execute_model_req.blocks_to_swap_out,
                execute_model_req.blocks_to_copy
        ]):
            raise NotImplementedError(
                "MedusaWorker does not support cache operations")

        if any(
                len(seq_group_metadata.seq_data.keys()) != 1
                for seq_group_metadata in
                execute_model_req.seq_group_metadata_list):
            raise NotImplementedError(
                "MedusaWorker does not support beam search.")