mixtral_quant.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. # coding=utf-8
  2. # Adapted from
  3. # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
  4. # Copyright 2023 The vLLM team.
  5. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
  6. #
  7. # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
  8. # and OPT implementations in this library. It has been modified from its
  9. # original forms to accommodate minor architectural differences compared
  10. # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
  11. #
  12. # Licensed under the Apache License, Version 2.0 (the "License");
  13. # you may not use this file except in compliance with the License.
  14. # You may obtain a copy of the License at
  15. #
  16. # http://www.apache.org/licenses/LICENSE-2.0
  17. #
  18. # Unless required by applicable law or agreed to in writing, software
  19. # distributed under the License is distributed on an "AS IS" BASIS,
  20. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  21. # See the License for the specific language governing permissions and
  22. # limitations under the License.
  23. """Inference-only Mixtral model."""
  24. from typing import Iterable, List, Optional, Tuple
  25. import numpy as np
  26. import torch
  27. import torch.nn.functional as F
  28. from torch import nn
  29. from transformers import MixtralConfig
  30. from aphrodite.attention import Attention, AttentionMetadata
  31. from aphrodite.common.config import CacheConfig
  32. from aphrodite.common.sequence import SamplerOutput
  33. from aphrodite.distributed import (get_tensor_model_parallel_rank,
  34. get_tensor_model_parallel_world_size,
  35. tensor_model_parallel_all_reduce)
  36. from aphrodite.modeling.layers.layernorm import RMSNorm
  37. from aphrodite.modeling.layers.linear import (QKVParallelLinear,
  38. ReplicatedLinear,
  39. RowParallelLinear)
  40. from aphrodite.modeling.layers.logits_processor import LogitsProcessor
  41. from aphrodite.modeling.layers.rotary_embedding import get_rope
  42. from aphrodite.modeling.layers.sampler import Sampler
  43. from aphrodite.modeling.layers.vocab_parallel_embedding import (
  44. ParallelLMHead, VocabParallelEmbedding)
  45. from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
  46. from aphrodite.modeling.sampling_metadata import SamplingMetadata
  47. from aphrodite.quantization.base_config import QuantizationConfig
  48. class MixtralMLP(nn.Module):
  49. def __init__(
  50. self,
  51. num_experts: int,
  52. hidden_size: int,
  53. intermediate_size: int,
  54. quant_config: Optional[QuantizationConfig] = None,
  55. ) -> None:
  56. super().__init__()
  57. self.num_experts = num_experts
  58. self.ffn_dim = intermediate_size
  59. self.hidden_dim = hidden_size
  60. self.w1 = ReplicatedLinear(self.hidden_dim,
  61. self.ffn_dim,
  62. bias=False,
  63. quant_config=quant_config)
  64. self.w2 = ReplicatedLinear(self.ffn_dim,
  65. self.hidden_dim,
  66. bias=False,
  67. quant_config=quant_config)
  68. self.w3 = ReplicatedLinear(self.hidden_dim,
  69. self.ffn_dim,
  70. bias=False,
  71. quant_config=quant_config)
  72. # TODO: Use vllm's SiluAndMul
  73. self.act_fn = nn.SiLU()
  74. def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  75. w1_out, _ = self.w1(hidden_states)
  76. w1_out = self.act_fn(w1_out)
  77. w3_out, _ = self.w3(hidden_states)
  78. current_hidden_states = w1_out * w3_out
  79. current_hidden_states, _ = self.w2(current_hidden_states)
  80. return current_hidden_states
  81. class MixtralMoE(nn.Module):
  82. def __init__(
  83. self,
  84. config: MixtralConfig,
  85. quant_config: Optional[QuantizationConfig] = None,
  86. ):
  87. super().__init__()
  88. self.config = config
  89. self.rank = get_tensor_model_parallel_rank()
  90. self.tp_size = get_tensor_model_parallel_world_size()
  91. self.num_total_experts = config.num_local_experts
  92. self.top_k = config.num_experts_per_tok
  93. if self.tp_size > self.num_total_experts:
  94. raise ValueError(
  95. f"Tensor parallel size {self.tp_size} is greater than "
  96. f"the number of experts {self.num_total_experts}.")
  97. # Split experts equally between ranks
  98. self.expert_indicies = np.array_split(range(
  99. self.num_total_experts), self.tp_size)[self.rank].tolist()
  100. if not self.expert_indicies:
  101. raise ValueError(
  102. f"Rank {self.rank} has no experts assigned to it.")
  103. self.experts = nn.ModuleList([
  104. MixtralMLP(self.num_total_experts,
  105. config.hidden_size,
  106. config.intermediate_size,
  107. quant_config=quant_config)
  108. if idx in self.expert_indicies else None
  109. for idx in range(self.num_total_experts)
  110. ])
  111. self.gate = ReplicatedLinear(config.hidden_size,
  112. self.num_total_experts,
  113. bias=False,
  114. quant_config=None)
  115. def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  116. num_tokens, hidden_dim = hidden_states.shape
  117. hidden_states = hidden_states.view(-1, hidden_dim)
  118. # router_logits: (num_tokens, n_experts)
  119. router_logits, _ = self.gate(hidden_states)
  120. routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
  121. routing_weights, selected_experts = torch.topk(routing_weights,
  122. self.top_k,
  123. dim=-1)
  124. routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
  125. final_hidden_states = None
  126. for expert_idx in self.expert_indicies:
  127. expert_layer = self.experts[expert_idx]
  128. expert_mask = (selected_experts == expert_idx)
  129. expert_weights = (routing_weights * expert_mask).sum(dim=-1,
  130. keepdim=True)
  131. current_hidden_states = expert_layer(hidden_states).mul_(
  132. expert_weights)
  133. if final_hidden_states is None:
  134. final_hidden_states = current_hidden_states
  135. else:
  136. final_hidden_states.add_(current_hidden_states)
  137. return tensor_model_parallel_all_reduce(final_hidden_states).view(
  138. num_tokens, hidden_dim)
  139. class MixtralAttention(nn.Module):
  140. def __init__(
  141. self,
  142. hidden_size: int,
  143. num_heads: int,
  144. num_kv_heads: int,
  145. max_position: int = 4096 * 32,
  146. rope_theta: float = 10000,
  147. quant_config: Optional[QuantizationConfig] = None,
  148. sliding_window: Optional[int] = None,
  149. cache_config: Optional[CacheConfig] = None,
  150. ) -> None:
  151. super().__init__()
  152. self.hidden_size = hidden_size
  153. tp_size = get_tensor_model_parallel_world_size()
  154. self.total_num_heads = num_heads
  155. assert self.total_num_heads % tp_size == 0
  156. self.num_heads = self.total_num_heads // tp_size
  157. self.total_num_kv_heads = num_kv_heads
  158. if self.total_num_kv_heads >= tp_size:
  159. # Number of KV heads is greater than TP size, so we partition
  160. # the KV heads across multiple tensor parallel GPUs.
  161. assert self.total_num_kv_heads % tp_size == 0
  162. else:
  163. # Number of KV heads is less than TP size, so we replicate
  164. # the KV heads across multiple tensor parallel GPUs.
  165. assert tp_size % self.total_num_kv_heads == 0
  166. self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
  167. self.head_dim = hidden_size // self.total_num_heads
  168. self.q_size = self.num_heads * self.head_dim
  169. self.kv_size = self.num_kv_heads * self.head_dim
  170. self.scaling = self.head_dim**-0.5
  171. self.rope_theta = rope_theta
  172. self.sliding_window = sliding_window
  173. self.qkv_proj = QKVParallelLinear(
  174. hidden_size,
  175. self.head_dim,
  176. self.total_num_heads,
  177. self.total_num_kv_heads,
  178. bias=False,
  179. quant_config=quant_config,
  180. )
  181. self.o_proj = RowParallelLinear(
  182. self.total_num_heads * self.head_dim,
  183. hidden_size,
  184. bias=False,
  185. quant_config=quant_config,
  186. )
  187. self.rotary_emb = get_rope(
  188. self.head_dim,
  189. rotary_dim=self.head_dim,
  190. max_position=max_position,
  191. base=int(self.rope_theta),
  192. is_neox_style=True,
  193. )
  194. self.attn = Attention(self.num_heads,
  195. self.head_dim,
  196. self.scaling,
  197. num_kv_heads=self.num_kv_heads,
  198. sliding_window=self.sliding_window,
  199. cache_config=cache_config,
  200. quant_config=quant_config)
  201. def forward(
  202. self,
  203. positions: torch.Tensor,
  204. hidden_states: torch.Tensor,
  205. kv_cache: torch.Tensor,
  206. attn_metadata: AttentionMetadata,
  207. ) -> torch.Tensor:
  208. qkv, _ = self.qkv_proj(hidden_states)
  209. q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
  210. q, k = self.rotary_emb(positions, q, k)
  211. attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
  212. output, _ = self.o_proj(attn_output)
  213. return output
  214. class MixtralDecoderLayer(nn.Module):
  215. def __init__(
  216. self,
  217. config: MixtralConfig,
  218. cache_config: Optional[CacheConfig] = None,
  219. quant_config: Optional[QuantizationConfig] = None,
  220. ) -> None:
  221. super().__init__()
  222. self.hidden_size = config.hidden_size
  223. # Requires transformers > 4.32.0
  224. rope_theta = getattr(config, "rope_theta", 10000)
  225. self.self_attn = MixtralAttention(
  226. hidden_size=self.hidden_size,
  227. num_heads=config.num_attention_heads,
  228. max_position=config.max_position_embeddings,
  229. num_kv_heads=config.num_key_value_heads,
  230. rope_theta=rope_theta,
  231. sliding_window=config.sliding_window,
  232. cache_config=cache_config,
  233. quant_config=quant_config)
  234. self.block_sparse_moe = MixtralMoE(config=config,
  235. quant_config=quant_config)
  236. self.input_layernorm = RMSNorm(config.hidden_size,
  237. eps=config.rms_norm_eps)
  238. self.post_attention_layernorm = RMSNorm(config.hidden_size,
  239. eps=config.rms_norm_eps)
  240. def forward(
  241. self,
  242. positions: torch.Tensor,
  243. hidden_states: torch.Tensor,
  244. kv_cache: torch.Tensor,
  245. attn_metadata: AttentionMetadata,
  246. residual: Optional[torch.Tensor],
  247. ) -> torch.Tensor:
  248. # Self Attention
  249. if residual is None:
  250. residual = hidden_states
  251. hidden_states = self.input_layernorm(hidden_states)
  252. else:
  253. hidden_states, residual = self.input_layernorm(
  254. hidden_states, residual)
  255. hidden_states = self.self_attn(
  256. positions=positions,
  257. hidden_states=hidden_states,
  258. kv_cache=kv_cache,
  259. attn_metadata=attn_metadata,
  260. )
  261. # Fully Connected
  262. hidden_states, residual = self.post_attention_layernorm(
  263. hidden_states, residual)
  264. hidden_states = self.block_sparse_moe(hidden_states)
  265. return hidden_states, residual
  266. class MixtralModel(nn.Module):
  267. def __init__(
  268. self,
  269. config: MixtralConfig,
  270. cache_config: Optional[CacheConfig] = None,
  271. quant_config: Optional[QuantizationConfig] = None,
  272. ) -> None:
  273. super().__init__()
  274. self.padding_idx = config.pad_token_id
  275. self.vocab_size = config.vocab_size
  276. self.embed_tokens = VocabParallelEmbedding(
  277. config.vocab_size,
  278. config.hidden_size,
  279. )
  280. self.layers = nn.ModuleList([
  281. MixtralDecoderLayer(config,
  282. cache_config,
  283. quant_config=quant_config)
  284. for _ in range(config.num_hidden_layers)
  285. ])
  286. self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
  287. def forward(
  288. self,
  289. input_ids: torch.Tensor,
  290. positions: torch.Tensor,
  291. kv_caches: List[torch.Tensor],
  292. attn_metadata: AttentionMetadata,
  293. ) -> torch.Tensor:
  294. hidden_states = self.embed_tokens(input_ids)
  295. residual = None
  296. for i in range(len(self.layers)):
  297. layer = self.layers[i]
  298. hidden_states, residual = layer(positions, hidden_states,
  299. kv_caches[i], attn_metadata,
  300. residual)
  301. hidden_states, _ = self.norm(hidden_states, residual)
  302. return hidden_states
  303. class MixtralForCausalLM(nn.Module):
  304. fall_back_to_pt_during_load = False
  305. def __init__(
  306. self,
  307. config: MixtralConfig,
  308. cache_config: Optional[CacheConfig] = None,
  309. quant_config: Optional[QuantizationConfig] = None,
  310. ) -> None:
  311. super().__init__()
  312. self.config = config
  313. self.quant_config = quant_config
  314. self.model = MixtralModel(config, cache_config, quant_config)
  315. self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
  316. self.logits_processor = LogitsProcessor(config.vocab_size)
  317. self.sampler = Sampler()
  318. def forward(
  319. self,
  320. input_ids: torch.Tensor,
  321. positions: torch.Tensor,
  322. kv_caches: List[torch.Tensor],
  323. attn_metadata: AttentionMetadata,
  324. ) -> torch.Tensor:
  325. hidden_states = self.model(input_ids, positions, kv_caches,
  326. attn_metadata)
  327. return hidden_states
  328. def compute_logits(self, hidden_states: torch.Tensor,
  329. sampling_metadata: SamplingMetadata) -> torch.Tensor:
  330. logits = self.logits_processor(self.lm_head.weight, hidden_states,
  331. sampling_metadata)
  332. return logits
  333. def sample(
  334. self,
  335. logits: Optional[torch.Tensor],
  336. sampling_metadata: SamplingMetadata,
  337. ) -> Optional[SamplerOutput]:
  338. next_tokens = self.sampler(logits, sampling_metadata)
  339. return next_tokens
  340. def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  341. stacked_params_mapping = [
  342. # (param_name, shard_name, shard_id)
  343. ("qkv_proj", "q_proj", "q"),
  344. ("qkv_proj", "k_proj", "k"),
  345. ("qkv_proj", "v_proj", "v"),
  346. ]
  347. params_dict = dict(self.named_parameters())
  348. for name, loaded_weight in weights:
  349. if "rotary_emb.inv_freq" in name:
  350. continue
  351. for (param_name, weight_name, shard_id) in stacked_params_mapping:
  352. if weight_name not in name:
  353. continue
  354. name = name.replace(weight_name, param_name)
  355. # Skip loading extra bias for GPTQ models.
  356. if name.endswith(".bias") and name not in params_dict:
  357. continue
  358. param = params_dict[name]
  359. weight_loader = param.weight_loader
  360. weight_loader(param, loaded_weight, shard_id)
  361. break
  362. else:
  363. # Skip loading extra bias for GPTQ models.
  364. if name.endswith(".bias") and name not in params_dict:
  365. continue
  366. # Skip experts that are not assigned to this worker.
  367. if ("block_sparse_moe.experts." in name
  368. and name not in params_dict):
  369. continue
  370. param = params_dict[name]
  371. weight_loader = getattr(param, "weight_loader",
  372. default_weight_loader)
  373. weight_loader(param, loaded_weight)