arctic.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531
  1. """Inference-only Snowflake Arctic model."""
  2. from typing import Iterable, List, Optional, Tuple
  3. import torch
  4. from loguru import logger
  5. from torch import nn
  6. from aphrodite.attention import Attention, AttentionMetadata
  7. from aphrodite.common.config import CacheConfig
  8. from aphrodite.common.sequence import IntermediateTensors, SamplerOutput
  9. from aphrodite.distributed import (get_tensor_model_parallel_rank,
  10. get_tensor_model_parallel_world_size,
  11. tensor_model_parallel_all_reduce)
  12. from aphrodite.modeling.layers.activation import SiluAndMul
  13. from aphrodite.modeling.layers.fused_moe import fused_experts, fused_topk
  14. from aphrodite.modeling.layers.layernorm import RMSNorm
  15. from aphrodite.modeling.layers.linear import (MergedColumnParallelLinear,
  16. QKVParallelLinear,
  17. ReplicatedLinear,
  18. RowParallelLinear)
  19. from aphrodite.modeling.layers.logits_processor import LogitsProcessor
  20. from aphrodite.modeling.layers.rotary_embedding import get_rope
  21. from aphrodite.modeling.layers.sampler import Sampler
  22. from aphrodite.modeling.layers.vocab_parallel_embedding import (
  23. ParallelLMHead, VocabParallelEmbedding)
  24. from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
  25. from aphrodite.modeling.sampling_metadata import SamplingMetadata
  26. from aphrodite.modeling.utils import set_weight_attrs
  27. from aphrodite.quantization.base_config import QuantizationConfig
  28. from aphrodite.quantization.deepspeedfp import (DeepSpeedFPConfig,
  29. DeepSpeedFPParameter)
  30. from aphrodite.transformers_utils.configs.arctic import ArcticConfig
  31. class ArcticMLP(nn.Module):
  32. def __init__(self,
  33. config: ArcticConfig,
  34. layer_id: int,
  35. expert_id: int = -1,
  36. is_residual_mlp: bool = False,
  37. quant_config: Optional[QuantizationConfig] = None,
  38. reduce_results: bool = True):
  39. super(ArcticMLP, self).__init__()
  40. self.hidden_size = config.hidden_size
  41. self.expert_id = expert_id
  42. self.layer_id = layer_id
  43. self.ffn_dim = config.intermediate_size if not is_residual_mlp \
  44. else self.hidden_size
  45. self.w13 = MergedColumnParallelLinear(self.hidden_size,
  46. [self.ffn_dim] * 2,
  47. bias=False,
  48. quant_config=quant_config)
  49. self.w2 = RowParallelLinear(self.ffn_dim,
  50. self.hidden_size,
  51. bias=False,
  52. reduce_results=reduce_results,
  53. quant_config=quant_config)
  54. if config.hidden_act != "silu":
  55. raise ValueError(f"Unsupported activation: {config.hidden_act}. "
  56. "Only silu is supported for now.")
  57. self.act_fn = SiluAndMul()
  58. def forward(self, hidden_states):
  59. gate_up, _ = self.w13(hidden_states)
  60. hidden_states = self.act_fn(gate_up)
  61. hidden_states, _ = self.w2(hidden_states)
  62. return hidden_states
  63. class ArcticMoE(nn.Module):
  64. """
  65. Model-parallel implementation of Arctic MoE Layer.
  66. """
  67. def __init__(self,
  68. config: ArcticConfig,
  69. layer_id: int,
  70. tp_size: Optional[int] = None,
  71. params_dtype: Optional[torch.dtype] = None,
  72. quant_config: Optional[QuantizationConfig] = None,
  73. reduce_results: bool = True):
  74. super(ArcticMoE, self).__init__()
  75. self.tp_size = tp_size or get_tensor_model_parallel_world_size()
  76. self.hidden_size = config.hidden_size
  77. self.num_experts = config.num_local_experts
  78. self.layer_id = layer_id
  79. self.top_k = config.num_experts_per_tok
  80. self.intermediate_size = config.intermediate_size // self.tp_size
  81. self.is_moe_layer = (layer_id + 1) % config.moe_layer_frequency == 0
  82. self.is_quant = isinstance(quant_config, DeepSpeedFPConfig)
  83. self.reduce_results = reduce_results
  84. # Some other parameters
  85. if params_dtype is None:
  86. params_dtype = torch.get_default_dtype()
  87. self.params_dtype = params_dtype
  88. if not self.is_moe_layer:
  89. self.mlp = ArcticMLP(config,
  90. layer_id=layer_id,
  91. quant_config=quant_config,
  92. reduce_results=reduce_results)
  93. else:
  94. self.gate = ReplicatedLinear(self.hidden_size,
  95. self.num_experts,
  96. bias=False,
  97. params_dtype=self.params_dtype,
  98. quant_config=quant_config)
  99. if self.is_quant:
  100. self.ws = DeepSpeedFPParameter(
  101. torch.Size((self.num_experts, 2 * self.intermediate_size,
  102. self.hidden_size)),
  103. params_dtype=params_dtype,
  104. quant_config=quant_config,
  105. )
  106. self.w2s = DeepSpeedFPParameter(
  107. torch.Size((self.num_experts, self.hidden_size,
  108. self.intermediate_size)),
  109. params_dtype=params_dtype,
  110. quant_config=quant_config,
  111. )
  112. else:
  113. self.ws = nn.Parameter(
  114. torch.empty(self.num_experts,
  115. 2 * self.intermediate_size,
  116. self.hidden_size,
  117. device="cuda",
  118. dtype=self.params_dtype))
  119. self.w2s = nn.Parameter(
  120. torch.empty(self.num_experts,
  121. self.hidden_size,
  122. self.intermediate_size,
  123. device="cuda",
  124. dtype=self.params_dtype))
  125. set_weight_attrs(self.ws, {
  126. "weight_loader": self.weight_loader,
  127. })
  128. set_weight_attrs(self.w2s, {
  129. "weight_loader": self.weight_loader,
  130. })
  131. def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
  132. weight_name: str, expert_id: int):
  133. tp_rank = get_tensor_model_parallel_rank()
  134. param_data = param.ds_dequantize() if self.is_quant else param.data
  135. shard_size = self.intermediate_size
  136. shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
  137. if weight_name.endswith("w1.weight"):
  138. param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :]
  139. if weight_name.endswith("w3.weight"):
  140. param_data[expert_id,
  141. shard_size:2 * shard_size, :] = loaded_weight[shard, :]
  142. if weight_name.endswith("w2.weight"):
  143. param_data[expert_id, :, :] = loaded_weight[:, shard]
  144. if self.is_quant:
  145. param.ds_quantize_(param_data)
  146. def local_moe_fused(self, hidden_states: torch.Tensor) -> torch.Tensor:
  147. num_tokens, hidden_size = hidden_states.shape
  148. hidden_states = hidden_states.view(-1, self.hidden_size)
  149. # router_logits: (num_tokens, n_experts)
  150. router_logits, _ = self.gate(hidden_states)
  151. do_normalize = self.top_k > 1
  152. topk_weights, topk_ids = fused_topk(hidden_states,
  153. router_logits,
  154. self.top_k,
  155. renormalize=do_normalize)
  156. # topk_ids: (num_tokens, k)
  157. if self.is_quant:
  158. if 2 * num_tokens <= self.num_experts:
  159. # If much fewer tokens than experts, use selective dequantize.
  160. ws_dequantized = self.ws.ds_selective_dequantize(
  161. topk_ids.flatten())
  162. w2s_dequantized = self.w2s.ds_selective_dequantize(
  163. topk_ids.flatten())
  164. # We gathered the experts to the tokens so update the mapping.
  165. topk_ids = torch.arange(
  166. 0,
  167. topk_ids.numel(),
  168. device=topk_ids.device,
  169. ).reshape(topk_ids.shape)
  170. else:
  171. ws_dequantized = self.ws.ds_dequantize()
  172. w2s_dequantized = self.w2s.ds_dequantize()
  173. final_hidden_states = fused_experts(
  174. hidden_states,
  175. ws_dequantized if self.is_quant else self.ws,
  176. w2s_dequantized if self.is_quant else self.w2s,
  177. topk_weights,
  178. topk_ids,
  179. inplace=True)
  180. if self.reduce_results and self.tp_size > 1:
  181. final_hidden_states = tensor_model_parallel_all_reduce(
  182. final_hidden_states)
  183. return final_hidden_states.view(num_tokens, hidden_size)
  184. def forward(self, hidden_states: torch.Tensor):
  185. if self.is_moe_layer:
  186. final_hidden_states = self.local_moe_fused(hidden_states)
  187. else:
  188. final_hidden_states = self.mlp(hidden_states)
  189. return final_hidden_states
  190. class ArcticAttention(nn.Module):
  191. def __init__(
  192. self,
  193. config: ArcticConfig,
  194. layer_idx: Optional[int] = None,
  195. cache_config: Optional[CacheConfig] = None,
  196. quant_config: Optional[QuantizationConfig] = None,
  197. ):
  198. super().__init__()
  199. self.config = config
  200. self.layer_idx = layer_idx
  201. self.hidden_size = config.hidden_size
  202. tp_size = get_tensor_model_parallel_world_size()
  203. self.total_num_heads = config.num_attention_heads
  204. assert self.total_num_heads % tp_size == 0
  205. self.num_heads = self.total_num_heads // tp_size
  206. self.total_num_kv_heads = config.num_key_value_heads
  207. if self.total_num_kv_heads >= tp_size:
  208. assert self.total_num_kv_heads % tp_size == 0
  209. else:
  210. assert tp_size % self.total_num_kv_heads == 0
  211. self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
  212. self.head_dim = self.hidden_size // self.total_num_heads
  213. self.q_size = self.num_heads * self.head_dim
  214. self.kv_size = self.num_kv_heads * self.head_dim
  215. self.max_position_embeddings = config.max_position_embeddings
  216. self.rope_theta = config.rope_theta
  217. self.scaling = self.head_dim**-0.5
  218. self.qkv_proj = QKVParallelLinear(self.hidden_size,
  219. self.head_dim,
  220. self.total_num_heads,
  221. self.total_num_kv_heads,
  222. bias=False,
  223. quant_config=quant_config)
  224. self.o_proj = RowParallelLinear(
  225. self.total_num_heads * self.head_dim,
  226. self.hidden_size,
  227. bias=False,
  228. reduce_results=True,
  229. quant_config=quant_config,
  230. )
  231. self.rotary_emb = get_rope(
  232. self.head_dim,
  233. rotary_dim=self.head_dim,
  234. max_position=self.max_position_embeddings,
  235. base=int(self.rope_theta),
  236. is_neox_style=True,
  237. )
  238. self.attn = Attention(self.num_heads,
  239. self.head_dim,
  240. self.scaling,
  241. num_kv_heads=self.num_kv_heads,
  242. cache_config=cache_config,
  243. quant_config=quant_config)
  244. def forward(
  245. self,
  246. positions: torch.Tensor,
  247. hidden_states: torch.Tensor,
  248. kv_cache: torch.Tensor,
  249. attn_metadata: AttentionMetadata,
  250. ) -> torch.Tensor:
  251. qkv, _ = self.qkv_proj(hidden_states)
  252. q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
  253. q, k = self.rotary_emb(positions, q, k)
  254. attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
  255. output, _ = self.o_proj(attn_output)
  256. return output
  257. class ArcticDecoderLayer(nn.Module):
  258. def __init__(
  259. self,
  260. config: ArcticConfig,
  261. layer_idx: int,
  262. cache_config: Optional[CacheConfig] = None,
  263. quant_config: Optional[QuantizationConfig] = None,
  264. ) -> None:
  265. super().__init__()
  266. self.layer_idx = layer_idx
  267. self.hidden_size = config.hidden_size
  268. is_moe_layer = (layer_idx + 1) % config.moe_layer_frequency == 0
  269. self.use_residual = config.use_residual and is_moe_layer
  270. self.self_attn = ArcticAttention(config,
  271. layer_idx,
  272. cache_config,
  273. quant_config=quant_config)
  274. self.block_sparse_moe = ArcticMoE(
  275. config,
  276. layer_id=layer_idx,
  277. quant_config=quant_config,
  278. reduce_results=(not self.use_residual))
  279. self.input_layernorm = RMSNorm(config.hidden_size,
  280. eps=config.rms_norm_eps)
  281. self.post_attention_layernorm = RMSNorm(config.hidden_size,
  282. eps=config.rms_norm_eps)
  283. if self.use_residual:
  284. self.residual_layernorm = RMSNorm(config.hidden_size,
  285. eps=config.rms_norm_eps)
  286. self.residual_mlp = ArcticMLP(config,
  287. layer_id=layer_idx,
  288. is_residual_mlp=True,
  289. reduce_results=False)
  290. def forward(
  291. self,
  292. positions: torch.Tensor,
  293. hidden_states: torch.Tensor,
  294. kv_cache: torch.Tensor,
  295. attn_metadata: AttentionMetadata,
  296. ) -> torch.Tensor:
  297. residual_input = hidden_states
  298. hidden_states = self.input_layernorm(hidden_states)
  299. hidden_states = self.self_attn(
  300. positions=positions,
  301. hidden_states=hidden_states,
  302. kv_cache=kv_cache,
  303. attn_metadata=attn_metadata,
  304. )
  305. hidden_states = residual_input + hidden_states
  306. residual_attn = hidden_states
  307. if self.use_residual:
  308. hidden_states = self.residual_layernorm(hidden_states)
  309. hidden_states = self.residual_mlp(hidden_states)
  310. residual_mlp = hidden_states
  311. hidden_states = self.post_attention_layernorm(residual_input)
  312. hidden_states = self.block_sparse_moe(hidden_states)
  313. hidden_states = residual_mlp + hidden_states
  314. hidden_states = tensor_model_parallel_all_reduce(hidden_states)
  315. hidden_states = residual_attn + hidden_states
  316. else:
  317. hidden_states = self.post_attention_layernorm(hidden_states)
  318. hidden_states = self.block_sparse_moe(hidden_states)
  319. hidden_states = residual_attn + hidden_states
  320. return hidden_states
  321. class ArcticModel(nn.Module):
  322. def __init__(
  323. self,
  324. config: ArcticConfig,
  325. cache_config: Optional[CacheConfig] = None,
  326. quant_config: Optional[QuantizationConfig] = None,
  327. ) -> None:
  328. super().__init__()
  329. self.padding_idx = config.pad_token_id
  330. self.vocab_size = config.vocab_size
  331. self.embed_tokens = VocabParallelEmbedding(
  332. self.vocab_size,
  333. config.hidden_size,
  334. org_num_embeddings=self.vocab_size)
  335. self.layers = nn.ModuleList([
  336. ArcticDecoderLayer(config,
  337. layer_idx,
  338. cache_config,
  339. quant_config=quant_config)
  340. for layer_idx in range(config.num_hidden_layers)
  341. ])
  342. self._attn_implementation = config._attn_implementation
  343. self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
  344. def forward(
  345. self,
  346. input_ids: torch.Tensor,
  347. positions: torch.Tensor,
  348. kv_caches: List[torch.Tensor],
  349. attn_metadata: AttentionMetadata,
  350. ) -> torch.Tensor:
  351. hidden_states = self.embed_tokens(input_ids)
  352. for i in range(len(self.layers)):
  353. layer = self.layers[i]
  354. hidden_states = layer(positions, hidden_states, kv_caches[i],
  355. attn_metadata)
  356. hidden_states = self.norm(hidden_states)
  357. return hidden_states
  358. class ArcticForCausalLM(nn.Module):
  359. def __init__(self,
  360. config: ArcticConfig,
  361. cache_config: Optional[CacheConfig] = None,
  362. quant_config: Optional[QuantizationConfig] = None,
  363. **kwargs) -> None:
  364. super().__init__()
  365. self.config = config
  366. self.model = ArcticModel(config, cache_config, quant_config)
  367. self.vocab_size = config.vocab_size
  368. self.lm_head = ParallelLMHead(
  369. self.vocab_size,
  370. config.hidden_size,
  371. quant_config=quant_config,
  372. )
  373. self.num_experts = config.num_local_experts
  374. self.num_experts_per_tok = config.num_experts_per_tok
  375. self.unpadded_vocab_size = config.vocab_size
  376. self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
  377. config.vocab_size)
  378. self.sampler = Sampler()
  379. def forward(
  380. self,
  381. input_ids: torch.Tensor,
  382. positions: torch.Tensor,
  383. kv_caches: List[torch.Tensor],
  384. attn_metadata: AttentionMetadata,
  385. intermediate_tensors: Optional[IntermediateTensors] = None,
  386. ) -> torch.Tensor:
  387. hidden_states = self.model(input_ids, positions, kv_caches,
  388. attn_metadata)
  389. return hidden_states
  390. def compute_logits(self, hidden_states: torch.Tensor,
  391. sampling_metadata: SamplingMetadata) -> torch.Tensor:
  392. logits = self.logits_processor(self.lm_head, hidden_states,
  393. sampling_metadata)
  394. return logits
  395. def sample(
  396. self,
  397. logits: Optional[torch.Tensor],
  398. sampling_metadata: SamplingMetadata,
  399. ) -> Optional[SamplerOutput]:
  400. next_tokens = self.sampler(logits, sampling_metadata)
  401. return next_tokens
  402. def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  403. stacked_params_mapping = [
  404. # (param_name, shard_name, shard_id)
  405. ("qkv_proj", "q_proj", "q"),
  406. ("qkv_proj", "k_proj", "k"),
  407. ("qkv_proj", "v_proj", "v"),
  408. ]
  409. mlp_params_mapping = []
  410. expert_params_mapping = []
  411. num_layers = self.config.num_hidden_layers
  412. for layer in range(num_layers):
  413. mlp_params_mapping.append(
  414. (f"layers.{layer}.residual_mlp.w13.weight",
  415. f"layers.{layer}.residual_mlp.w1.weight", 0))
  416. mlp_params_mapping.append(
  417. (f"layers.{layer}.residual_mlp.w13.weight",
  418. f"layers.{layer}.residual_mlp.w3.weight", 1))
  419. if layer % 2 == 0:
  420. # MLP layers
  421. mlp_params_mapping.append(
  422. (f"layers.{layer}.block_sparse_moe.mlp.w13.weight",
  423. f"layers.{layer}.block_sparse_moe.mlp.w1.weight", 0))
  424. mlp_params_mapping.append(
  425. (f"layers.{layer}.block_sparse_moe.mlp.w13.weight",
  426. f"layers.{layer}.block_sparse_moe.mlp.w3.weight", 1))
  427. else:
  428. # MoE layers
  429. for expert_id in range(self.config.num_local_experts):
  430. expert_params_mapping.append(
  431. ("ws", f"experts.{expert_id}.w1.weight", expert_id))
  432. expert_params_mapping.append(
  433. ("w2s", f"experts.{expert_id}.w2.weight", expert_id))
  434. expert_params_mapping.append(
  435. ("ws", f"experts.{expert_id}.w3.weight", expert_id))
  436. params_dict = dict(self.named_parameters())
  437. logger.info(
  438. "It will take ~10 minutes loading from the 16-bit weights. "
  439. "Alternatively, use the prequantized 8-bit weights of arctic "
  440. "and set load-format to `sharded_state` will accelerate loading.")
  441. for name, loaded_weight in weights:
  442. for (param_name, weight_name, shard_id) in stacked_params_mapping:
  443. if weight_name not in name:
  444. continue
  445. name = name.replace(weight_name, param_name)
  446. # Skip loading extra bias for GPTQ models.
  447. if name.endswith(".bias") and name not in params_dict:
  448. continue
  449. param = params_dict[name]
  450. weight_loader = param.weight_loader
  451. weight_loader(param, loaded_weight, shard_id)
  452. break
  453. else:
  454. for param_name, weight_name, shard_id in mlp_params_mapping:
  455. if weight_name not in name:
  456. continue
  457. name = name.replace(weight_name, param_name)
  458. param = params_dict[name]
  459. weight_loader = param.weight_loader
  460. weight_loader(param, loaded_weight, shard_id)
  461. break
  462. else:
  463. for param_name, weight_name, shard_id \
  464. in expert_params_mapping:
  465. if weight_name not in name:
  466. continue
  467. name = name.replace(weight_name, param_name)
  468. param = params_dict[name]
  469. weight_loader = param.weight_loader
  470. weight_loader(param,
  471. loaded_weight,
  472. weight_name,
  473. expert_id=shard_id)
  474. break
  475. else:
  476. if name.endswith(".bias") and name not in params_dict:
  477. continue
  478. param = params_dict[name]
  479. weight_loader = getattr(param, "weight_loader",
  480. default_weight_loader)
  481. weight_loader(param, loaded_weight)