  1. """Inference-only Snowflake Arctic model."""
  2. from typing import Iterable, List, Optional, Tuple
  3. import torch
  4. from loguru import logger
  5. from torch import nn
  6. from aphrodite.attention import Attention, AttentionMetadata
  7. from aphrodite.common.config import CacheConfig
  8. from aphrodite.common.sequence import IntermediateTensors, SamplerOutput
  9. from aphrodite.common.utils import progress_bar
  10. from aphrodite.distributed import (get_tensor_model_parallel_rank,
  11. get_tensor_model_parallel_world_size,
  12. tensor_model_parallel_all_reduce)
  13. from aphrodite.modeling.layers.activation import SiluAndMul
  14. from aphrodite.modeling.layers.fused_moe import fused_experts, fused_topk
  15. from aphrodite.modeling.layers.layernorm import RMSNorm
  16. from aphrodite.modeling.layers.linear import (MergedColumnParallelLinear,
  17. QKVParallelLinear,
  18. ReplicatedLinear,
  19. RowParallelLinear)
  20. from aphrodite.modeling.layers.logits_processor import LogitsProcessor
  21. from aphrodite.modeling.layers.rotary_embedding import get_rope
  22. from aphrodite.modeling.layers.sampler import Sampler
  23. from aphrodite.modeling.layers.vocab_parallel_embedding import (
  24. ParallelLMHead, VocabParallelEmbedding)
  25. from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
  26. from aphrodite.modeling.sampling_metadata import SamplingMetadata
  27. from aphrodite.modeling.utils import set_weight_attrs
  28. from aphrodite.quantization.base_config import QuantizationConfig
  29. from aphrodite.quantization.deepspeedfp import (DeepSpeedFPConfig,
  30. DeepSpeedFPParameter)
  31. from aphrodite.transformers_utils.configs.arctic import ArcticConfig


class ArcticMLP(nn.Module):

    def __init__(self,
                 config: ArcticConfig,
                 layer_id: int,
                 expert_id: int = -1,
                 is_residual_mlp: bool = False,
                 quant_config: Optional[QuantizationConfig] = None,
                 reduce_results: bool = True):
        super(ArcticMLP, self).__init__()
        self.hidden_size = config.hidden_size
        self.expert_id = expert_id
        self.layer_id = layer_id

        self.ffn_dim = config.intermediate_size if not is_residual_mlp \
            else self.hidden_size

        self.w13 = MergedColumnParallelLinear(self.hidden_size,
                                              [self.ffn_dim] * 2,
                                              bias=False,
                                              quant_config=quant_config)
        self.w2 = RowParallelLinear(self.ffn_dim,
                                    self.hidden_size,
                                    bias=False,
                                    reduce_results=reduce_results,
                                    quant_config=quant_config)
        if config.hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, hidden_states):
        gate_up, _ = self.w13(hidden_states)
        hidden_states = self.act_fn(gate_up)
        hidden_states, _ = self.w2(hidden_states)
        return hidden_states
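

# Note: ArcticMLP fuses the gate (w1) and up (w3) projections into one
# MergedColumnParallelLinear ("w13") whose output is 2 * ffn_dim wide, and
# SiluAndMul splits that output in half to form a SwiGLU activation. A rough,
# framework-free sketch of the same computation (shapes assumed here, not
# read from a checkpoint):
#
#   gate_up = x @ w13.T                       # [tokens, 2 * ffn_dim]
#   gate, up = gate_up.chunk(2, dim=-1)       # two [tokens, ffn_dim] halves
#   y = torch.nn.functional.silu(gate) * up   # SwiGLU
#   out = y @ w2.T                            # [tokens, hidden_size]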


class ArcticMoE(nn.Module):
    """
    Model-parallel implementation of Arctic MoE Layer.
    """

    def __init__(self,
                 config: ArcticConfig,
                 layer_id: int,
                 tp_size: Optional[int] = None,
                 params_dtype: Optional[torch.dtype] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 reduce_results: bool = True):
        super(ArcticMoE, self).__init__()

        self.tp_size = tp_size or get_tensor_model_parallel_world_size()
        self.hidden_size = config.hidden_size
        self.num_experts = config.num_local_experts
        self.layer_id = layer_id
        self.top_k = config.num_experts_per_tok
        self.intermediate_size = config.intermediate_size // self.tp_size

        self.is_moe_layer = (layer_id + 1) % config.moe_layer_frequency == 0
        self.is_quant = isinstance(quant_config, DeepSpeedFPConfig)
        self.reduce_results = reduce_results
        # Some other parameters
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype

        if not self.is_moe_layer:
            self.mlp = ArcticMLP(config,
                                 layer_id=layer_id,
                                 quant_config=quant_config,
                                 reduce_results=reduce_results)
        else:
            self.gate = ReplicatedLinear(self.hidden_size,
                                         self.num_experts,
                                         bias=False,
                                         params_dtype=self.params_dtype,
                                         quant_config=quant_config)
            if self.is_quant:
                self.ws = DeepSpeedFPParameter(
                    torch.Size((self.num_experts, 2 * self.intermediate_size,
                                self.hidden_size)),
                    params_dtype=params_dtype,
                    quant_config=quant_config,
                )
                self.w2s = DeepSpeedFPParameter(
                    torch.Size((self.num_experts, self.hidden_size,
                                self.intermediate_size)),
                    params_dtype=params_dtype,
                    quant_config=quant_config,
                )
            else:
                self.ws = nn.Parameter(
                    torch.empty(self.num_experts,
                                2 * self.intermediate_size,
                                self.hidden_size,
                                device="cuda",
                                dtype=self.params_dtype))
                self.w2s = nn.Parameter(
                    torch.empty(self.num_experts,
                                self.hidden_size,
                                self.intermediate_size,
                                device="cuda",
                                dtype=self.params_dtype))
            set_weight_attrs(self.ws, {
                "weight_loader": self.weight_loader,
            })
            set_weight_attrs(self.w2s, {
                "weight_loader": self.weight_loader,
            })
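
    # The stacked expert weights keep all local experts in single tensors:
    #   ws  : [num_experts, 2 * intermediate_size, hidden_size]
    #         (per-expert w1 and w3 concatenated along dim 1, with
    #         intermediate_size already divided by the TP world size)
    #   w2s : [num_experts, hidden_size, intermediate_size]
    # `weight_loader` below is attached through set_weight_attrs so the
    # checkpoint loader can route per-expert shards into these tensors.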

    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
                      weight_name: str, expert_id: int):
        tp_rank = get_tensor_model_parallel_rank()
        param_data = param.ds_dequantize() if self.is_quant else param.data
        shard_size = self.intermediate_size
        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
        if weight_name.endswith("w1.weight"):
            param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :]
        if weight_name.endswith("w3.weight"):
            param_data[expert_id,
                       shard_size:2 * shard_size, :] = loaded_weight[shard, :]
        if weight_name.endswith("w2.weight"):
            param_data[expert_id, :, :] = loaded_weight[:, shard]
        if self.is_quant:
            param.ds_quantize_(param_data)
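
    # Per-rank sharding sketch for one expert (illustrative only; `e` is a
    # hypothetical expert index and `rows`/`cols` the slice owned by this
    # tensor-parallel rank):
    #   ws[e, :shard_size, :]               <- w1[rows, :]
    #   ws[e, shard_size:2 * shard_size, :] <- w3[rows, :]
    #   w2s[e, :, :]                        <- w2[:, cols]
    # w1/w3 are split along the intermediate (output) dimension and w2 along
    # its input dimension, matching the column-/row-parallel layouts.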

    def local_moe_fused(self, hidden_states: torch.Tensor) -> torch.Tensor:
        num_tokens, hidden_size = hidden_states.shape
        hidden_states = hidden_states.view(-1, self.hidden_size)
        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        do_normalize = self.top_k > 1
        topk_weights, topk_ids = fused_topk(hidden_states,
                                            router_logits,
                                            self.top_k,
                                            renormalize=do_normalize)
        # topk_ids: (num_tokens, k)
        if self.is_quant:
            if 2 * num_tokens <= self.num_experts:
                # If much fewer tokens than experts, use selective dequantize.
                ws_dequantized = self.ws.ds_selective_dequantize(
                    topk_ids.flatten())
                w2s_dequantized = self.w2s.ds_selective_dequantize(
                    topk_ids.flatten())
                # We gathered the experts to the tokens so update the mapping.
                topk_ids = torch.arange(
                    0,
                    topk_ids.numel(),
                    device=topk_ids.device,
                ).reshape(topk_ids.shape)
            else:
                ws_dequantized = self.ws.ds_dequantize()
                w2s_dequantized = self.w2s.ds_dequantize()

        final_hidden_states = fused_experts(
            hidden_states,
            ws_dequantized if self.is_quant else self.ws,
            w2s_dequantized if self.is_quant else self.w2s,
            topk_weights,
            topk_ids,
            inplace=True)
        if self.reduce_results and self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(
                final_hidden_states)
        return final_hidden_states.view(num_tokens, hidden_size)

    def forward(self, hidden_states: torch.Tensor):
        if self.is_moe_layer:
            final_hidden_states = self.local_moe_fused(hidden_states)
        else:
            final_hidden_states = self.mlp(hidden_states)
        return final_hidden_states
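

# Routing note: `fused_topk` is expected to softmax the router logits, take
# the top-k experts per token, and, since renormalize=do_normalize, rescale
# the kept weights to sum to one when top_k > 1. A plain-PyTorch
# approximation of that selection, for reference only:
#
#   probs = router_logits.softmax(dim=-1)               # [tokens, n_experts]
#   topk_weights, topk_ids = probs.topk(top_k, dim=-1)
#   if top_k > 1:
#       topk_weights /= topk_weights.sum(dim=-1, keepdim=True)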


class ArcticAttention(nn.Module):

    def __init__(
        self,
        config: ArcticConfig,
        layer_idx: Optional[int] = None,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

        self.hidden_size = config.hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = config.num_attention_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = config.num_key_value_heads
        if self.total_num_kv_heads >= tp_size:
            assert self.total_num_kv_heads % tp_size == 0
        else:
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.head_dim = self.hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.scaling = self.head_dim**-0.5

        self.qkv_proj = QKVParallelLinear(self.hidden_size,
                                          self.head_dim,
                                          self.total_num_heads,
                                          self.total_num_kv_heads,
                                          bias=False,
                                          quant_config=quant_config)
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            self.hidden_size,
            bias=False,
            reduce_results=True,
            quant_config=quant_config,
        )

        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=self.max_position_embeddings,
            base=int(self.rope_theta),
            is_neox_style=True,
        )

        self.attn = Attention(self.num_heads,
                              self.head_dim,
                              self.scaling,
                              num_kv_heads=self.num_kv_heads,
                              cache_config=cache_config,
                              quant_config=quant_config)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
        output, _ = self.o_proj(attn_output)
        return output
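

# Attention sharding note: query heads are split evenly across tensor-parallel
# ranks, while KV heads are either split (total_num_kv_heads >= tp_size) or
# effectively replicated so every rank keeps at least one KV head (the
# max(1, ...) above); QKVParallelLinear partitions the fused weight to match.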


class ArcticDecoderLayer(nn.Module):

    def __init__(
        self,
        config: ArcticConfig,
        layer_idx: int,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        is_moe_layer = (layer_idx + 1) % config.moe_layer_frequency == 0
        self.use_residual = config.use_residual and is_moe_layer
        self.self_attn = ArcticAttention(config,
                                         layer_idx,
                                         cache_config,
                                         quant_config=quant_config)
        self.block_sparse_moe = ArcticMoE(
            config,
            layer_id=layer_idx,
            quant_config=quant_config,
            reduce_results=(not self.use_residual))

        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

        if self.use_residual:
            self.residual_layernorm = RMSNorm(config.hidden_size,
                                              eps=config.rms_norm_eps)
            self.residual_mlp = ArcticMLP(config,
                                          layer_id=layer_idx,
                                          is_residual_mlp=True,
                                          reduce_results=False)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        residual_input = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            attn_metadata=attn_metadata,
        )
        hidden_states = residual_input + hidden_states

        residual_attn = hidden_states
        if self.use_residual:
            hidden_states = self.residual_layernorm(hidden_states)
            hidden_states = self.residual_mlp(hidden_states)
            residual_mlp = hidden_states
            hidden_states = self.post_attention_layernorm(residual_input)
            hidden_states = self.block_sparse_moe(hidden_states)
            hidden_states = residual_mlp + hidden_states
            hidden_states = tensor_model_parallel_all_reduce(hidden_states)
            hidden_states = residual_attn + hidden_states
        else:
            hidden_states = self.post_attention_layernorm(hidden_states)
            hidden_states = self.block_sparse_moe(hidden_states)
            hidden_states = residual_attn + hidden_states
        return hidden_states
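

# Residual-MLP note: on MoE layers with config.use_residual, two branches run
# in parallel: a dense residual MLP on the normalized attention output, and
# the sparse MoE on the normalized layer input (residual_input). Both are
# built with reduce_results=False, so their partial outputs are summed first
# and all-reduced once across tensor-parallel ranks before being added to the
# attention residual, rather than paying for two separate all-reduces.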


class ArcticModel(nn.Module):

    def __init__(
        self,
        config: ArcticConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = VocabParallelEmbedding(
            self.vocab_size,
            config.hidden_size,
            org_num_embeddings=self.vocab_size)
        self.layers = nn.ModuleList([
            ArcticDecoderLayer(config,
                               layer_idx,
                               cache_config,
                               quant_config=quant_config)
            for layer_idx in range(config.num_hidden_layers)
        ])
        self._attn_implementation = config._attn_implementation
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        for i in range(len(self.layers)):
            layer = self.layers[i]
            hidden_states = layer(positions, hidden_states, kv_caches[i],
                                  attn_metadata)
        hidden_states = self.norm(hidden_states)
        return hidden_states


class ArcticForCausalLM(nn.Module):

    def __init__(self,
                 config: ArcticConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 **kwargs) -> None:
        super().__init__()
        self.config = config
        self.model = ArcticModel(config, cache_config, quant_config)
        self.vocab_size = config.vocab_size
        self.lm_head = ParallelLMHead(
            self.vocab_size,
            config.hidden_size,
            quant_config=quant_config,
        )
        self.num_experts = config.num_local_experts
        self.num_experts_per_tok = config.num_experts_per_tok
        self.unpadded_vocab_size = config.vocab_size
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                config.vocab_size)
        self.sampler = Sampler()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: Optional[torch.Tensor],
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens
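
    # Inference-flow note: `forward` returns hidden states only; the serving
    # engine is expected to call `compute_logits` (projection through lm_head
    # plus logits processing) and then `sample` as separate steps.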

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
        ]
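        # Example: a checkpoint tensor whose name contains "q_proj" is loaded
        # into the fused "qkv_proj" parameter under shard id "q"; "k_proj"
        # and "v_proj" entries fill the remaining shards of the same tensor.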
        mlp_params_mapping = []
        expert_params_mapping = []
        num_layers = self.config.num_hidden_layers

        for layer in range(num_layers):
            mlp_params_mapping.append(
                (f"layers.{layer}.residual_mlp.w13.weight",
                 f"layers.{layer}.residual_mlp.w1.weight", 0))
            mlp_params_mapping.append(
                (f"layers.{layer}.residual_mlp.w13.weight",
                 f"layers.{layer}.residual_mlp.w3.weight", 1))
            if layer % 2 == 0:
                # MLP layers
                mlp_params_mapping.append(
                    (f"layers.{layer}.block_sparse_moe.mlp.w13.weight",
                     f"layers.{layer}.block_sparse_moe.mlp.w1.weight", 0))
                mlp_params_mapping.append(
                    (f"layers.{layer}.block_sparse_moe.mlp.w13.weight",
                     f"layers.{layer}.block_sparse_moe.mlp.w3.weight", 1))
            else:
                # MoE layers
                for expert_id in range(self.config.num_local_experts):
                    expert_params_mapping.append(
                        ("ws", f"experts.{expert_id}.w1.weight", expert_id))
                    expert_params_mapping.append(
                        ("w2s", f"experts.{expert_id}.w2.weight", expert_id))
                    expert_params_mapping.append(
                        ("ws", f"experts.{expert_id}.w3.weight", expert_id))

        params_dict = dict(self.named_parameters())
        logger.info(
            "Loading from the 16-bit weights will take ~10 minutes. "
            "Alternatively, use the prequantized 8-bit weights of Arctic "
            "and set load-format to `sharded_state` to accelerate loading.")
        weights_list = list(weights)
        for name, loaded_weight in progress_bar(weights_list,
                                                desc="Loading modules..."):
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                for param_name, weight_name, shard_id in mlp_params_mapping:
                    if weight_name not in name:
                        continue
                    name = name.replace(weight_name, param_name)
                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(param, loaded_weight, shard_id)
                    break
                else:
                    for param_name, weight_name, shard_id \
                            in expert_params_mapping:
                        if weight_name not in name:
                            continue
                        name = name.replace(weight_name, param_name)
                        param = params_dict[name]
                        weight_loader = param.weight_loader
                        weight_loader(param,
                                      loaded_weight,
                                      weight_name,
                                      expert_id=shard_id)
                        break
                    else:
                        if name.endswith(".bias") and name not in params_dict:
                            continue
                        param = params_dict[name]
                        weight_loader = getattr(param, "weight_loader",
                                                default_weight_loader)
                        weight_loader(param, loaded_weight)