1
0

stablelm.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. # coding=utf-8
  2. # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team.
  3. # All rights reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
  17. # This code is based off the following work:
  18. # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py
  19. # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json
  20. """Inference-only StabeLM (https://github.com/Stability-AI/StableLM)
  21. model compatible with HuggingFace weights."""
  22. from typing import Iterable, List, Optional, Tuple
  23. import torch
  24. from torch import nn
  25. from transformers import PretrainedConfig
  26. from aphrodite.attention import Attention, AttentionMetadata
  27. from aphrodite.common.config import CacheConfig
  28. from aphrodite.common.sequence import IntermediateTensors, SamplerOutput
  29. from aphrodite.distributed import get_tensor_model_parallel_world_size
  30. from aphrodite.modeling.layers.activation import SiluAndMul
  31. from aphrodite.modeling.layers.linear import (MergedColumnParallelLinear,
  32. QKVParallelLinear,
  33. RowParallelLinear)
  34. from aphrodite.modeling.layers.logits_processor import LogitsProcessor
  35. from aphrodite.modeling.layers.rotary_embedding import get_rope
  36. from aphrodite.modeling.layers.sampler import Sampler
  37. from aphrodite.modeling.layers.vocab_parallel_embedding import (
  38. ParallelLMHead, VocabParallelEmbedding)
  39. from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
  40. from aphrodite.modeling.sampling_metadata import SamplingMetadata
  41. from aphrodite.quantization.base_config import QuantizationConfig
  42. class StablelmMLP(nn.Module):
  43. def __init__(self,
  44. config: PretrainedConfig,
  45. quant_config: Optional[QuantizationConfig] = None) -> None:
  46. super().__init__()
  47. self.config = config
  48. self.hidden_size = config.hidden_size
  49. self.intermediate_size = config.intermediate_size
  50. self.gate_up_proj = MergedColumnParallelLinear(
  51. config.hidden_size, [config.intermediate_size] * 2,
  52. bias=False,
  53. quant_config=quant_config)
  54. self.down_proj = RowParallelLinear(config.intermediate_size,
  55. config.hidden_size,
  56. bias=False)
  57. self.act_fn = SiluAndMul()
  58. def forward(self, x: torch.Tensor) -> torch.Tensor:
  59. gate_up, _ = self.gate_up_proj(x)
  60. x = self.act_fn(gate_up)
  61. x, _ = self.down_proj(x)
  62. return x
class StablelmAttention(nn.Module):
    """Multi-head attention with partial rotary embeddings.

    Q/K/V and output projections are sharded for tensor parallelism; KV
    heads are either partitioned across ranks or replicated when there are
    fewer KV heads than TP ranks.
    """

    def __init__(self,
                 config: PretrainedConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None) -> None:
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = config.num_attention_heads
        # Query heads are split evenly across tensor-parallel ranks.
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_key_value_heads = config.num_key_value_heads
        if self.total_num_key_value_heads >= tp_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_key_value_heads % tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_key_value_heads == 0
        # At least one (possibly replicated) KV head per rank.
        self.num_key_value_heads = max(
            1, self.total_num_key_value_heads // tp_size)
        self.head_dim = self.hidden_size // self.total_num_heads
        self.max_position_embeddings = config.max_position_embeddings
        # Older StableLM configs name the partial-rotary fraction "rope_pct";
        # newer HF configs use "partial_rotary_factor". Default 1 applies
        # rotary embeddings to the full head dimension.
        rope_pct = getattr(config, "rope_pct",
                           getattr(config, "partial_rotary_factor", 1))
        self.rotary_ndims = int(self.head_dim * rope_pct)
        # Standard 1/sqrt(head_dim) attention scaling.
        self.scaling = self.head_dim**-0.5
        # Per-rank flattened widths used to split the fused QKV output.
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_key_value_heads * self.head_dim
        self.qkv_bias = getattr(config, "use_qkv_bias", False)
        if (self.head_dim * self.num_heads * tp_size) != self.hidden_size:
            raise ValueError(f"hidden_size must be divisible by num_heads "
                             f"(got `hidden_size`: {self.hidden_size}"
                             f" and `num_heads`: {self.num_heads}).")
        self.qkv_proj = QKVParallelLinear(self.hidden_size,
                                          self.head_dim,
                                          self.total_num_heads,
                                          self.total_num_key_value_heads,
                                          self.qkv_bias,
                                          quant_config=quant_config)
        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
                                        self.hidden_size,
                                        bias=False,
                                        quant_config=quant_config)
        # Rotary embedding applied only to the first `rotary_ndims` dims of
        # each head (partial rotary).
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.rotary_ndims,
            max_position=self.config.max_position_embeddings,
            base=self.config.rope_theta,
        )
        self.attn = Attention(self.num_heads,
                              self.head_dim,
                              self.scaling,
                              num_kv_heads=self.num_key_value_heads,
                              cache_config=cache_config,
                              quant_config=quant_config)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        """Project to QKV, apply rotary embeddings, attend, and project out."""
        qkv, _ = self.qkv_proj(hidden_states)
        # Split the fused projection into per-rank Q, K, V slices.
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
        output, _ = self.o_proj(attn_output)
        return output
  133. class StablelmDecoderLayer(nn.Module):
  134. def __init__(
  135. self,
  136. config: PretrainedConfig,
  137. cache_config: Optional[CacheConfig] = None,
  138. quant_config: Optional[QuantizationConfig] = None,
  139. ) -> None:
  140. super().__init__()
  141. self.self_attn = StablelmAttention(config, cache_config, quant_config)
  142. self.mlp = StablelmMLP(config, quant_config)
  143. norm_eps = getattr(config, "norm_eps",
  144. getattr(config, "layer_norm_eps", 1e-05))
  145. self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
  146. self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
  147. eps=norm_eps)
  148. def forward(
  149. self,
  150. positions: torch.Tensor,
  151. hidden_states: torch.Tensor,
  152. kv_cache: torch.Tensor,
  153. attn_metadata: AttentionMetadata,
  154. ) -> Tuple[torch.Tensor, torch.Tensor]:
  155. # Self Attention
  156. residual = hidden_states
  157. hidden_states = self.input_layernorm(hidden_states)
  158. hidden_states = self.self_attn(
  159. positions=positions,
  160. hidden_states=hidden_states,
  161. kv_cache=kv_cache,
  162. attn_metadata=attn_metadata,
  163. )
  164. hidden_states = residual + hidden_states
  165. # Fully Connected
  166. residual = hidden_states
  167. hidden_states = self.post_attention_layernorm(hidden_states)
  168. hidden_states = self.mlp(hidden_states)
  169. hidden_states = residual + hidden_states
  170. return hidden_states, residual
  171. class StableLMEpochModel(nn.Module):
  172. def __init__(self,
  173. config: PretrainedConfig,
  174. cache_config: Optional[CacheConfig] = None,
  175. quant_config: Optional[QuantizationConfig] = None) -> None:
  176. super().__init__()
  177. self.embed_tokens = VocabParallelEmbedding(
  178. config.vocab_size,
  179. config.hidden_size,
  180. )
  181. self.layers = nn.ModuleList([
  182. StablelmDecoderLayer(config, cache_config, quant_config)
  183. for _ in range(config.num_hidden_layers)
  184. ])
  185. norm_eps = getattr(config, "norm_eps",
  186. getattr(config, "layer_norm_eps", 1e-05))
  187. self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
  188. def forward(
  189. self,
  190. input_ids: torch.Tensor,
  191. positions: torch.Tensor,
  192. kv_caches: List[torch.Tensor],
  193. attn_metadata: AttentionMetadata,
  194. ) -> torch.Tensor:
  195. hidden_states = self.embed_tokens(input_ids)
  196. for i in range(len(self.layers)):
  197. layer = self.layers[i]
  198. hidden_states, residual = layer(
  199. positions,
  200. hidden_states,
  201. kv_caches[i],
  202. attn_metadata,
  203. )
  204. hidden_states = self.norm(hidden_states)
  205. return hidden_states
class StablelmForCausalLM(nn.Module):
    """StableLM model with a language-modeling head for causal generation."""

    def __init__(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.config = config
        self.quant_config = quant_config
        self.model = StableLMEpochModel(config, cache_config, quant_config)
        self.lm_head = ParallelLMHead(config.vocab_size,
                                      config.hidden_size,
                                      quant_config=quant_config)
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = Sampler()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> torch.Tensor:
        """Run the transformer stack and return the final hidden states.

        `intermediate_tensors` is accepted for engine-interface
        compatibility but is not used by this model.
        """
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """Project hidden states to vocabulary logits via the LM head."""
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Sample next tokens from `logits`."""
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load checkpoint weights, fusing per-projection shards into the
        merged QKV and gate/up parameters used by this implementation."""
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            # Rotary caches are recomputed at runtime; never load them.
            if "rotary_emb.inv_freq" in name:
                continue
            if ("rotary_emb.cos_cached" in name
                    or "rotary_emb.sin_cached" in name):
                # Models trained using ColossalAI may include these tensors in
                # the checkpoint. Skip them.
                continue
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                # Map the checkpoint's per-shard name onto the fused param.
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Non-stacked parameter: load directly (for/else runs only
                # when no stacked mapping matched above).
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)