granite.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. # coding=utf-8
  2. # Adapted from
  3. # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
  4. # Copyright 2023 The PygmalionAI team.
  5. # Copyright 2023 The vLLM team.
  6. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
  7. #
  8. # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
  9. # and OPT implementations in this library. It has been modified from its
  10. # original forms to accommodate minor architectural differences compared
  11. # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
  12. #
  13. # Licensed under the Apache License, Version 2.0 (the "License");
  14. # you may not use this file except in compliance with the License.
  15. # You may obtain a copy of the License at
  16. #
  17. # http://www.apache.org/licenses/LICENSE-2.0
  18. #
  19. # Unless required by applicable law or agreed to in writing, software
  20. # distributed under the License is distributed on an "AS IS" BASIS,
  21. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22. # See the License for the specific language governing permissions and
  23. # limitations under the License.
  24. """Inference-only IBM Granite model compatible with HuggingFace weights."""
  25. from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
  26. import torch
  27. from torch import nn
  28. from transformers import GraniteConfig
  29. from aphrodite.attention import Attention, AttentionMetadata
  30. from aphrodite.common.config import CacheConfig, LoRAConfig
  31. from aphrodite.common.sequence import IntermediateTensors
  32. from aphrodite.common.utils import is_hip
  33. from aphrodite.distributed import (get_pp_group,
  34. get_tensor_model_parallel_rank,
  35. get_tensor_model_parallel_world_size)
  36. from aphrodite.modeling.layers.activation import SiluAndMul
  37. from aphrodite.modeling.layers.layernorm import RMSNorm
  38. from aphrodite.modeling.layers.linear import (MergedColumnParallelLinear,
  39. QKVParallelLinear,
  40. RowParallelLinear)
  41. from aphrodite.modeling.layers.logits_processor import LogitsProcessor
  42. from aphrodite.modeling.layers.rotary_embedding import get_rope
  43. from aphrodite.modeling.layers.sampler import Sampler, SamplerOutput
  44. from aphrodite.modeling.layers.vocab_parallel_embedding import (
  45. DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
  46. from aphrodite.modeling.model_loader.weight_utils import (
  47. default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
  48. from aphrodite.modeling.sampling_metadata import SamplingMetadata
  49. from aphrodite.quantization.base_config import QuantizationConfig
  50. from aphrodite.quantization.compressed_tensors.utils import (
  51. get_compressed_tensors_cache_scale)
  52. from .interfaces import SupportsLoRA
  53. from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
  54. class GraniteMLP(nn.Module):
  55. def __init__(
  56. self,
  57. hidden_size: int,
  58. intermediate_size: int,
  59. hidden_act: str,
  60. quant_config: Optional[QuantizationConfig] = None,
  61. bias: bool = False,
  62. prefix: str = "",
  63. ) -> None:
  64. super().__init__()
  65. self.gate_up_proj = MergedColumnParallelLinear(
  66. input_size=hidden_size,
  67. output_sizes=[intermediate_size] * 2,
  68. bias=bias,
  69. quant_config=quant_config,
  70. prefix=f"{prefix}.gate_up_proj",
  71. )
  72. self.down_proj = RowParallelLinear(
  73. input_size=intermediate_size,
  74. output_size=hidden_size,
  75. bias=bias,
  76. quant_config=quant_config,
  77. prefix=f"{prefix}.down_proj",
  78. )
  79. if hidden_act != "silu":
  80. raise ValueError(
  81. f"Unsupported activation: {hidden_act}. "
  82. "Only silu is supported for now."
  83. )
  84. self.act_fn = SiluAndMul()
  85. def forward(self, x):
  86. gate_up, _ = self.gate_up_proj(x)
  87. x = self.act_fn(gate_up)
  88. x, _ = self.down_proj(x)
  89. return x
  90. class GraniteAttention(nn.Module):
  91. def __init__(
  92. self,
  93. config: GraniteConfig,
  94. hidden_size: int,
  95. num_heads: int,
  96. num_kv_heads: int,
  97. rope_theta: float = 10000,
  98. rope_scaling: Optional[Dict[str, Any]] = None,
  99. max_position_embeddings: int = 8192,
  100. quant_config: Optional[QuantizationConfig] = None,
  101. bias: bool = False,
  102. cache_config: Optional[CacheConfig] = None,
  103. prefix: str = "",
  104. ) -> None:
  105. super().__init__()
  106. self.hidden_size = hidden_size
  107. tp_size = get_tensor_model_parallel_world_size()
  108. self.total_num_heads = num_heads
  109. assert self.total_num_heads % tp_size == 0
  110. self.num_heads = self.total_num_heads // tp_size
  111. self.total_num_kv_heads = num_kv_heads
  112. if self.total_num_kv_heads >= tp_size:
  113. # Number of KV heads is greater than TP size, so we partition
  114. # the KV heads across multiple tensor parallel GPUs.
  115. assert self.total_num_kv_heads % tp_size == 0
  116. else:
  117. # Number of KV heads is less than TP size, so we replicate
  118. # the KV heads across multiple tensor parallel GPUs.
  119. assert tp_size % self.total_num_kv_heads == 0
  120. self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
  121. # MistralConfig has an optional head_dim introduced by Mistral-Nemo
  122. self.head_dim = getattr(
  123. config, "head_dim", self.hidden_size // self.total_num_heads
  124. )
  125. self.q_size = self.num_heads * self.head_dim
  126. self.kv_size = self.num_kv_heads * self.head_dim
  127. self.scaling = config.attention_multiplier
  128. self.rope_theta = rope_theta
  129. self.max_position_embeddings = max_position_embeddings
  130. self.qkv_proj = QKVParallelLinear(
  131. hidden_size=hidden_size,
  132. head_size=self.head_dim,
  133. total_num_heads=self.total_num_heads,
  134. total_num_kv_heads=self.total_num_kv_heads,
  135. bias=bias,
  136. quant_config=quant_config,
  137. prefix=f"{prefix}.qkv_proj",
  138. )
  139. self.o_proj = RowParallelLinear(
  140. input_size=self.total_num_heads * self.head_dim,
  141. output_size=hidden_size,
  142. bias=bias,
  143. quant_config=quant_config,
  144. prefix=f"{prefix}.o_proj",
  145. )
  146. self.rotary_emb = get_rope(
  147. self.head_dim,
  148. rotary_dim=self.head_dim,
  149. max_position=max_position_embeddings,
  150. base=rope_theta,
  151. rope_scaling=rope_scaling,
  152. )
  153. self.attn = Attention(
  154. self.num_heads,
  155. self.head_dim,
  156. self.scaling,
  157. num_kv_heads=self.num_kv_heads,
  158. cache_config=cache_config,
  159. quant_config=quant_config,
  160. )
  161. def forward(
  162. self,
  163. positions: torch.Tensor,
  164. hidden_states: torch.Tensor,
  165. kv_cache: torch.Tensor,
  166. attn_metadata: AttentionMetadata,
  167. ) -> torch.Tensor:
  168. qkv, _ = self.qkv_proj(hidden_states)
  169. q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
  170. q, k = self.rotary_emb(positions, q, k)
  171. attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
  172. output, _ = self.o_proj(attn_output)
  173. return output
  174. class GraniteDecoderLayer(nn.Module):
  175. def __init__(
  176. self,
  177. config: GraniteConfig,
  178. cache_config: Optional[CacheConfig] = None,
  179. quant_config: Optional[QuantizationConfig] = None,
  180. prefix: str = "",
  181. ) -> None:
  182. super().__init__()
  183. self.hidden_size = config.hidden_size
  184. self.residual_multiplier = config.residual_multiplier
  185. rope_theta = getattr(config, "rope_theta", 10000)
  186. rope_scaling = getattr(config, "rope_scaling", None)
  187. if rope_scaling is not None and getattr(
  188. config, "original_max_position_embeddings", None
  189. ):
  190. rope_scaling[
  191. "original_max_position_embeddings"
  192. ] = config.original_max_position_embeddings
  193. max_position_embeddings = getattr(
  194. config, "max_position_embeddings", 8192
  195. )
  196. # Support abacusai/Smaug-72B-v0.1 with attention_bias
  197. # Support internlm/internlm-7b with bias
  198. attention_bias = getattr(config, "attention_bias", False) or getattr(
  199. config, "bias", False
  200. )
  201. self.self_attn = GraniteAttention(
  202. config=config,
  203. hidden_size=self.hidden_size,
  204. num_heads=config.num_attention_heads,
  205. num_kv_heads=getattr(
  206. config, "num_key_value_heads", config.num_attention_heads
  207. ),
  208. rope_theta=rope_theta,
  209. rope_scaling=rope_scaling,
  210. max_position_embeddings=max_position_embeddings,
  211. quant_config=quant_config,
  212. bias=attention_bias,
  213. cache_config=cache_config,
  214. prefix=f"{prefix}.self_attn",
  215. )
  216. self.mlp = GraniteMLP(
  217. hidden_size=self.hidden_size,
  218. intermediate_size=config.intermediate_size,
  219. hidden_act=config.hidden_act,
  220. quant_config=quant_config,
  221. bias=getattr(config, "mlp_bias", False),
  222. prefix=f"{prefix}.mlp",
  223. )
  224. self.input_layernorm = RMSNorm(
  225. config.hidden_size, eps=config.rms_norm_eps
  226. )
  227. self.post_attention_layernorm = RMSNorm(
  228. config.hidden_size, eps=config.rms_norm_eps
  229. )
  230. def forward(
  231. self,
  232. positions: torch.Tensor,
  233. hidden_states: torch.Tensor,
  234. kv_cache: torch.Tensor,
  235. attn_metadata: AttentionMetadata,
  236. ) -> Tuple[torch.Tensor, torch.Tensor]:
  237. # Self Attention
  238. residual = hidden_states
  239. hidden_states = self.input_layernorm(hidden_states)
  240. hidden_states = self.self_attn(
  241. positions=positions,
  242. hidden_states=hidden_states,
  243. kv_cache=kv_cache,
  244. attn_metadata=attn_metadata,
  245. )
  246. hidden_states = residual + hidden_states * self.residual_multiplier
  247. # Fully Connected
  248. residual = hidden_states
  249. hidden_states = self.post_attention_layernorm(hidden_states)
  250. hidden_states = self.mlp(hidden_states)
  251. hidden_states = residual + hidden_states * self.residual_multiplier
  252. return hidden_states
  253. class GraniteModel(nn.Module):
  254. def __init__(
  255. self,
  256. config: GraniteConfig,
  257. cache_config: Optional[CacheConfig] = None,
  258. quant_config: Optional[QuantizationConfig] = None,
  259. lora_config: Optional[LoRAConfig] = None,
  260. prefix: str = "",
  261. ) -> None:
  262. super().__init__()
  263. self.config = config
  264. self.padding_idx = config.pad_token_id
  265. lora_vocab = (
  266. (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
  267. if lora_config
  268. else 0
  269. )
  270. self.vocab_size = config.vocab_size + lora_vocab
  271. self.org_vocab_size = config.vocab_size
  272. if get_pp_group().is_first_rank or (
  273. config.tie_word_embeddings and get_pp_group().is_last_rank
  274. ):
  275. self.embed_tokens = VocabParallelEmbedding(
  276. self.vocab_size,
  277. config.hidden_size,
  278. org_num_embeddings=config.vocab_size,
  279. quant_config=quant_config,
  280. )
  281. else:
  282. self.embed_tokens = PPMissingLayer()
  283. self.start_layer, self.end_layer, self.layers = make_layers(
  284. config.num_hidden_layers,
  285. lambda prefix: GraniteDecoderLayer(
  286. config=config,
  287. cache_config=cache_config,
  288. quant_config=quant_config,
  289. prefix=prefix,
  290. ),
  291. prefix=f"{prefix}.layers",
  292. )
  293. if get_pp_group().is_last_rank:
  294. self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
  295. else:
  296. self.norm = PPMissingLayer()
  297. def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
  298. return self.embed_tokens(input_ids)
  299. def forward(
  300. self,
  301. input_ids: Optional[torch.Tensor],
  302. positions: torch.Tensor,
  303. kv_caches: List[torch.Tensor],
  304. attn_metadata: AttentionMetadata,
  305. intermediate_tensors: Optional[IntermediateTensors],
  306. inputs_embeds: Optional[torch.Tensor] = None,
  307. ) -> Union[torch.Tensor, IntermediateTensors]:
  308. if get_pp_group().is_first_rank:
  309. if inputs_embeds is not None:
  310. hidden_states = inputs_embeds
  311. else:
  312. hidden_states = self.get_input_embeddings(input_ids)
  313. residual = None
  314. else:
  315. assert intermediate_tensors is not None
  316. hidden_states = intermediate_tensors["hidden_states"]
  317. residual = intermediate_tensors["residual"]
  318. hidden_states *= self.config.embedding_multiplier
  319. for i in range(self.start_layer, self.end_layer):
  320. layer = self.layers[i]
  321. hidden_states = layer(
  322. positions,
  323. hidden_states,
  324. kv_caches[i - self.start_layer],
  325. attn_metadata,
  326. )
  327. if not get_pp_group().is_last_rank:
  328. return IntermediateTensors(
  329. {"hidden_states": hidden_states, "residual": residual}
  330. )
  331. hidden_states = self.norm(hidden_states)
  332. return hidden_states
  333. class GraniteForCausalLM(nn.Module, SupportsLoRA):
  334. packed_modules_mapping = {
  335. "qkv_proj": [
  336. "q_proj",
  337. "k_proj",
  338. "v_proj",
  339. ],
  340. "gate_up_proj": [
  341. "gate_proj",
  342. "up_proj",
  343. ],
  344. }
  345. # LoRA specific attributes
  346. supported_lora_modules = [
  347. "qkv_proj",
  348. "o_proj",
  349. "gate_up_proj",
  350. "down_proj",
  351. "embed_tokens",
  352. "lm_head",
  353. ]
  354. embedding_modules = {
  355. "embed_tokens": "input_embeddings",
  356. "lm_head": "output_embeddings",
  357. }
  358. embedding_padding_modules = ["lm_head"]
  359. bitsandbytes_stacked_params_mapping = {
  360. # shard_name, weight_name, index
  361. "q_proj": ("qkv_proj", 0),
  362. "k_proj": ("qkv_proj", 1),
  363. "v_proj": ("qkv_proj", 2),
  364. "gate_proj": ("gate_up_proj", 0),
  365. "up_proj": ("gate_up_proj", 1),
  366. }
  367. def __init__(
  368. self,
  369. config: GraniteConfig,
  370. cache_config: Optional[CacheConfig] = None,
  371. quant_config: Optional[QuantizationConfig] = None,
  372. lora_config: Optional[LoRAConfig] = None,
  373. ) -> None:
  374. super().__init__()
  375. self.config = config
  376. self.lora_config = lora_config
  377. self.model = GraniteModel(
  378. config,
  379. cache_config,
  380. quant_config,
  381. lora_config=lora_config,
  382. prefix="model",
  383. )
  384. if get_pp_group().is_last_rank:
  385. self.unpadded_vocab_size = config.vocab_size
  386. if lora_config:
  387. self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
  388. self.lm_head = ParallelLMHead(
  389. self.unpadded_vocab_size,
  390. config.hidden_size,
  391. org_num_embeddings=config.vocab_size,
  392. padding_size=DEFAULT_VOCAB_PADDING_SIZE
  393. # We need bigger padding if using lora for kernel
  394. # compatibility
  395. if not lora_config
  396. else lora_config.lora_vocab_padding_size,
  397. quant_config=quant_config,
  398. )
  399. if config.tie_word_embeddings:
  400. self.lm_head.weight = self.model.embed_tokens.weight
  401. logit_scale = getattr(config, "logit_scale", 1.0)
  402. self.logits_processor = LogitsProcessor(
  403. self.unpadded_vocab_size, config.vocab_size, logit_scale
  404. )
  405. self.sampler = Sampler()
  406. else:
  407. self.lm_head = PPMissingLayer()
  408. def forward(
  409. self,
  410. input_ids: torch.Tensor,
  411. positions: torch.Tensor,
  412. kv_caches: List[torch.Tensor],
  413. attn_metadata: AttentionMetadata,
  414. intermediate_tensors: Optional[IntermediateTensors] = None,
  415. ) -> Union[torch.Tensor, IntermediateTensors]:
  416. model_output = self.model(
  417. input_ids, positions, kv_caches, attn_metadata, intermediate_tensors
  418. )
  419. return model_output
  420. def compute_logits(
  421. self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata
  422. ) -> Optional[torch.Tensor]:
  423. logits = self.logits_processor(
  424. self.lm_head, hidden_states, sampling_metadata
  425. )
  426. return logits
  427. def sample(
  428. self,
  429. logits: torch.Tensor,
  430. sampling_metadata: SamplingMetadata,
  431. ) -> Optional[SamplerOutput]:
  432. next_tokens = self.sampler(logits, sampling_metadata)
  433. return next_tokens
  434. def make_empty_intermediate_tensors(
  435. self, batch_size: int, dtype: torch.dtype, device: torch.device
  436. ) -> IntermediateTensors:
  437. return IntermediateTensors(
  438. {
  439. "hidden_states": torch.zeros(
  440. (batch_size, self.config.hidden_size),
  441. dtype=dtype,
  442. device=device,
  443. ),
  444. "residual": torch.zeros(
  445. (batch_size, self.config.hidden_size),
  446. dtype=dtype,
  447. device=device,
  448. ),
  449. }
  450. )
  451. def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  452. stacked_params_mapping = [
  453. # (param_name, shard_name, shard_id)
  454. (".qkv_proj", ".q_proj", "q"),
  455. (".qkv_proj", ".k_proj", "k"),
  456. (".qkv_proj", ".v_proj", "v"),
  457. (".gate_up_proj", ".gate_proj", 0),
  458. (".gate_up_proj", ".up_proj", 1),
  459. ]
  460. params_dict = dict(self.named_parameters())
  461. for name, loaded_weight in weights:
  462. if "rotary_emb.inv_freq" in name:
  463. continue
  464. if (
  465. "rotary_emb.cos_cached" in name
  466. or "rotary_emb.sin_cached" in name
  467. ):
  468. # Models trained using ColossalAI may include these tensors in
  469. # the checkpoint. Skip them.
  470. continue
  471. # With tie_word_embeddings, we can skip lm_head.weight
  472. # The weight might appear unnecessarily in the files if the model is
  473. # processed with quantization, LoRA, fine-tuning, etc.
  474. if self.config.tie_word_embeddings and "lm_head.weight" in name:
  475. continue
  476. if scale_name := get_compressed_tensors_cache_scale(name):
  477. # Loading kv cache scales for compressed-tensors quantization
  478. param = params_dict[scale_name]
  479. weight_loader = getattr(
  480. param, "weight_loader", default_weight_loader
  481. )
  482. loaded_weight = loaded_weight[0]
  483. weight_loader(param, loaded_weight)
  484. continue
  485. for param_name, weight_name, shard_id in stacked_params_mapping:
  486. if weight_name not in name:
  487. continue
  488. name = name.replace(weight_name, param_name)
  489. # Skip loading extra bias for GPTQ models.
  490. if name.endswith(".bias") and name not in params_dict:
  491. continue
  492. if is_pp_missing_parameter(name, self):
  493. continue
  494. param = params_dict[name]
  495. weight_loader = param.weight_loader
  496. weight_loader(param, loaded_weight, shard_id)
  497. break
  498. else:
  499. # Skip loading extra bias for GPTQ models.
  500. if name.endswith(".bias") and name not in params_dict:
  501. continue
  502. # Remapping the name of FP8 kv-scale.
  503. name = maybe_remap_kv_scale_name(name, params_dict)
  504. if name is None:
  505. continue
  506. if is_pp_missing_parameter(name, self):
  507. continue
  508. param = params_dict[name]
  509. weight_loader = getattr(
  510. param, "weight_loader", default_weight_loader
  511. )
  512. weight_loader(param, loaded_weight)
  513. # If this function is called, it should always initialize KV cache scale
  514. # factors (or else raise an exception). Thus, handled exceptions should
  515. # make sure to leave KV cache scale factors in a known good (dummy) state
  516. def load_kv_cache_scales(self, quantization_param_path: str) -> None:
  517. tp_size = get_tensor_model_parallel_world_size()
  518. tp_rank = get_tensor_model_parallel_rank()
  519. for layer_idx, scaling_factor in kv_cache_scales_loader(
  520. quantization_param_path,
  521. tp_rank,
  522. tp_size,
  523. self.config.num_hidden_layers,
  524. self.config.__class__.model_type,
  525. ):
  526. if not isinstance(self.model.layers[layer_idx], nn.Identity):
  527. layer_self_attn = self.model.layers[layer_idx].self_attn
  528. if is_hip():
  529. # The scaling factor convention we are assuming is
  530. # quantized_value * scaling_factor ~= true_value
  531. # which is consistent with the practice of setting
  532. # scaling_factor = tensor_amax / FPtype_max
  533. scaling_factor *= 2
  534. if hasattr(layer_self_attn, "kv_scale"):
  535. layer_self_attn.attn._kv_scale = scaling_factor
  536. else:
  537. raise RuntimeError(
  538. "Self attention has no KV cache scaling "
  539. "factor attribute!"
  540. )