medusa.py

from typing import Iterable, List, Optional, Tuple

import torch
import torch.nn as nn

from aphrodite.common.sequence import SamplerOutput
from aphrodite.modeling.layers.logits_processor import LogitsProcessor
from aphrodite.modeling.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
from aphrodite.modeling.sampling_metadata import SamplingMetadata
from aphrodite.transformers_utils.configs.medusa import MedusaConfig


class ResidualBlock(nn.Module):
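    """Stack of residual feed-forward layers: ``x = x + SiLU(Linear(x))``.

    Used as the hidden-state transform in front of each Medusa LM head.
    """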

    def __init__(self, hidden_size: int, num_layers: int) -> None:
        super().__init__()

        self.layers = nn.ModuleList([
            nn.Linear(hidden_size, hidden_size, bias=False)
            for _ in range(num_layers)
        ])
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            x = x + self.act(layer(x))
        return x


class Medusa(nn.Module):
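    """Medusa-style draft heads for speculative decoding.

    Each head is a ``ResidualBlock`` trunk followed by its own LM head that
    proposes one future token from the target model's hidden states (top-1
    proposals only). An optional ``token_map`` maps a truncated draft
    vocabulary of ``truncated_vocab_size`` entries back to positions in the
    original vocabulary; it is loaded from the checkpoint in ``load_weights``.

    Reference: https://arxiv.org/abs/2401.10774
    """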

    def __init__(self, config: MedusaConfig, **_) -> None:
        super().__init__()
        self.config = config
        # One residual trunk per Medusa head.
        self.blocks = nn.ModuleList([
            ResidualBlock(hidden_size=self.config.hidden_size,
                          num_layers=self.config.num_hidden_layers)
            for _ in range(self.config.num_heads)
        ])
        self.orig_vocab_size = config.vocab_size
        self.truncated_vocab_size = config.truncated_vocab_size
        self.unpadded_vocab_size = self.truncated_vocab_size

        # Each head gets its own (vocab-parallel) LM head over the possibly
        # truncated draft vocabulary.
        self.lm_heads = nn.ModuleList([
            ParallelLMHead(
                self.unpadded_vocab_size,
                config.hidden_size,
                org_num_embeddings=self.truncated_vocab_size,
                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
            ) for _ in range(self.config.num_heads)
        ])

        logit_scale = getattr(config, "logit_scale", 1.0)
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                self.truncated_vocab_size,
                                                logit_scale)
        # Populated by load_weights when the checkpoint provides a token_map.
        self.token_map = None

    def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]:
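        """Run every head's residual trunk on the shared hidden states."""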
        return [block(hidden_states) for block in self.blocks]

    def compute_logits(
            self, hidden_states: List[torch.Tensor],
            sampling_metadata: SamplingMetadata) -> List[torch.Tensor]:
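        """Project each head's hidden states to logits.

        When a ``token_map`` is set, the per-head logits over the truncated
        vocabulary are scattered back into a full-vocabulary tensor that is
        ``-inf`` everywhere else.
        """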
        logits_lst: List[torch.Tensor] = []

        for hs, lm_head in zip(hidden_states, self.lm_heads):
            _logits = self.logits_processor(lm_head, hs, sampling_metadata)

            if _logits is None:
                # _logits should only be None on ranks > 0, in which case it
                # is None for every lm_head.
                assert len(logits_lst) == 0
                continue

            if self.token_map is None:
                logits_lst.append(_logits)
            else:
                logits_lst.append(-torch.inf * torch.ones(
                    size=(*_logits.shape[:-1], self.orig_vocab_size),
                    device=_logits.device,
                    dtype=_logits.dtype))

                logits_lst[-1][..., self.token_map] = _logits

        return logits_lst

    def sample(
        self,
        logits: List[torch.Tensor],
        sampling_metadata: SamplingMetadata,
    ) -> List[SamplerOutput]:
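        """Pick top-1 tokens per head and package them per sequence group."""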
        logits = torch.stack(logits, dim=0).float()
        logprobs = torch.log_softmax(logits, dim=-1)
        token_ids = logits.argmax(-1)  # support only top-1 for now
        probs = torch.softmax(logits, dim=-1)

        token_id_list = []
        token_prob_list = []
        token_logprob_list = []

        for seq_group in sampling_metadata.seq_groups:
            token_id_list.append(token_ids[:, seq_group.sample_indices])
            token_prob_list.append(probs[:, seq_group.sample_indices])
            token_logprob_list.append(logprobs[:, seq_group.sample_indices])

        outputs: List[Optional[SamplerOutput]] = []
        for idx in range(len(sampling_metadata.seq_groups)):
            outputs.append(
                SamplerOutput(
                    outputs=None,
                    sampled_token_probs=token_prob_list[idx].squeeze(1),
                    logprobs=token_logprob_list[idx].squeeze(1),
                    sampled_token_ids=token_id_list[idx].squeeze(1),
                ))

        return outputs

    def generate_proposals(
        self,
        previous_hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> List[SamplerOutput]:
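        """Convenience wrapper: forward -> compute_logits -> sample."""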
        return self.sample(
            logits=self.compute_logits(
                hidden_states=self.forward(previous_hidden_states),
                sampling_metadata=sampling_metadata,
            ),
            sampling_metadata=sampling_metadata,
        )

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
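        """Load checkpoint weights, handling the optional ``token_map``.

        LM-head weights stored over the full vocabulary are sliced down to
        the truncated vocabulary via ``token_map`` before loading.
        """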
        params_dict = dict(self.named_parameters())
        weights_map = {}

        for name, loaded_weight in weights:
            name = name.replace("medusa_heads.", "")

            if name == "token_map":
                if self.truncated_vocab_size < self.orig_vocab_size:
                    self.token_map = nn.Parameter(loaded_weight,
                                                  requires_grad=False)
            elif name in params_dict:
                weights_map[name] = loaded_weight

        for name, loaded_weight in weights_map.items():
            if "lm_head" in name and self.token_map is not None and \
                    loaded_weight.shape[0] > self.token_map.shape[0]:
                loaded_weight = loaded_weight[self.token_map]

            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)

        if self.token_map is not None:
            # `Tensor.to` is not in-place; reassign so the map actually ends
            # up on the same device as the LM heads.
            self.token_map = nn.Parameter(
                self.token_map.to(device=self.lm_heads[0].weight.device),
                requires_grad=False)

        assert (self.truncated_vocab_size
                == self.orig_vocab_size) or (self.token_map is not None)