medusa.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. from typing import Iterable, List, Optional, Tuple
  2. import torch
  3. import torch.nn as nn
  4. from aphrodite.common.sequence import SamplerOutput
  5. from aphrodite.modeling.layers.logits_processor import LogitsProcessor
  6. from aphrodite.modeling.layers.vocab_parallel_embedding import (
  7. DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
  8. from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
  9. from aphrodite.modeling.sampling_metadata import SamplingMetadata
  10. from aphrodite.transformers_utils.configs.medusa import MedusaConfig
  11. class ResidualBlock(nn.Module):
  12. def __init__(self, hidden_size: int, num_layers: int) -> None:
  13. super().__init__()
  14. self.layers = nn.ModuleList([
  15. nn.Linear(hidden_size, hidden_size, bias=False)
  16. for _ in range(num_layers)
  17. ])
  18. self.act = nn.SiLU()
  19. def forward(self, x: torch.Tensor) -> torch.Tensor:
  20. for layer in self.layers:
  21. x = x + self.act(layer(x))
  22. return x
  23. class Medusa(nn.Module):
  24. def __init__(self, config: MedusaConfig, **_) -> None:
  25. super().__init__()
  26. self.config = config
  27. self.blocks = nn.ModuleList([
  28. ResidualBlock(hidden_size=self.config.hidden_size,
  29. num_layers=self.config.num_hidden_layers)
  30. for _ in range(self.config.num_heads)
  31. ])
  32. self.orig_vocab_size = config.vocab_size
  33. self.truncated_vocab_size = config.truncated_vocab_size
  34. self.unpadded_vocab_size = self.truncated_vocab_size
  35. self.lm_heads = nn.ModuleList([
  36. ParallelLMHead(
  37. self.unpadded_vocab_size,
  38. config.hidden_size,
  39. org_num_embeddings=self.truncated_vocab_size,
  40. padding_size=DEFAULT_VOCAB_PADDING_SIZE,
  41. ) for _ in range(self.config.num_heads)
  42. ])
  43. logit_scale = getattr(config, "logit_scale", 1.0)
  44. self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
  45. self.truncated_vocab_size,
  46. logit_scale)
  47. self.token_map = None
  48. def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]:
  49. return [block(hidden_states) for block in self.blocks]
  50. def compute_logits(
  51. self, hidden_states: List[torch.Tensor],
  52. sampling_metadata: SamplingMetadata) -> List[torch.Tensor]:
  53. logits = []
  54. for hs, lm_head in zip(hidden_states, self.lm_heads):
  55. _logits = self.logits_processor(lm_head, hs, sampling_metadata)
  56. if self.token_map is None:
  57. logits.append(_logits)
  58. else:
  59. logits.append(-torch.inf * torch.ones(
  60. size=(*_logits.shape[:-1], self.orig_vocab_size),
  61. device=_logits.device,
  62. dtype=_logits.dtype))
  63. logits[-1][..., self.token_map] = _logits
  64. return logits
  65. def sample(
  66. self,
  67. logits: List[torch.Tensor],
  68. sampling_metadata: SamplingMetadata,
  69. ) -> List[SamplerOutput]:
  70. logits = torch.stack(logits, dim=0).float()
  71. logprobs = torch.log_softmax(logits, dim=-1)
  72. token_ids = logits.argmax(-1) # support only top-1 for now
  73. probs = torch.softmax(logits, dim=-1)
  74. token_id_list = []
  75. token_prob_list = []
  76. token_logprob_list = []
  77. for idx, seq_group in enumerate(sampling_metadata.seq_groups):
  78. token_id_list.append(token_ids[:, seq_group.sample_indices])
  79. token_prob_list.append(probs[:, seq_group.sample_indices])
  80. token_logprob_list.append(logprobs[:, seq_group.sample_indices])
  81. outputs: List[Optional[SamplerOutput]] = []
  82. for idx in range(len(sampling_metadata.seq_groups)):
  83. outputs.append(
  84. SamplerOutput(
  85. outputs=None,
  86. sampled_token_probs=token_prob_list[idx].squeeze(1),
  87. logprobs=token_logprob_list[idx].squeeze(1),
  88. sampled_token_ids=token_id_list[idx].squeeze(1),
  89. ))
  90. return outputs
  91. def generate_proposals(
  92. self,
  93. previous_hidden_states: torch.Tensor,
  94. sampling_metadata: SamplingMetadata,
  95. ) -> List[SamplerOutput]:
  96. return self.sample(
  97. logits=self.compute_logits(
  98. hidden_states=self.forward(previous_hidden_states),
  99. sampling_metadata=sampling_metadata,
  100. ),
  101. sampling_metadata=sampling_metadata,
  102. )
  103. def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  104. params_dict = dict(self.named_parameters())
  105. weights_map = {}
  106. for name, loaded_weight in weights:
  107. name = name.replace("medusa_heads.", "")
  108. if name == "token_map":
  109. if self.truncated_vocab_size < self.orig_vocab_size:
  110. self.token_map = nn.Parameter(loaded_weight,
  111. requires_grad=False)
  112. elif name in params_dict:
  113. weights_map[name] = loaded_weight
  114. for name, loaded_weight in weights_map.items():
  115. if "lm_head" in name and self.token_map is not None and\
  116. loaded_weight.shape[0] > self.token_map.shape[0]:
  117. loaded_weight = loaded_weight[self.token_map]
  118. param = params_dict[name]
  119. weight_loader = getattr(param, "weight_loader",
  120. default_weight_loader)
  121. weight_loader(param, loaded_weight)
  122. if self.token_map is not None:
  123. self.token_map.to(device=self.lm_heads[0].weight.device)
  124. assert (self.truncated_vocab_size
  125. == self.orig_vocab_size) or (self.token_map is not None)