1
0

layers.py 41 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134
  1. # pylint: disable=unused-argument
  2. import inspect
  3. import math
  4. from dataclasses import dataclass
  5. from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Type
  6. import torch
  7. import torch.nn as nn
  8. import torch.nn.functional as F
  9. from transformers import PretrainedConfig
  10. from aphrodite.common.config import LoRAConfig
  11. from aphrodite.distributed import (get_tensor_model_parallel_rank,
  12. get_tensor_model_parallel_world_size,
  13. split_tensor_along_last_dim,
  14. tensor_model_parallel_all_gather,
  15. tensor_model_parallel_all_reduce,
  16. tensor_model_parallel_gather)
  17. from aphrodite.lora.punica import add_lora, add_lora_slice, bgmv
  18. from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
  19. MergedColumnParallelLinear,
  20. QKVParallelLinear,
  21. RowParallelLinear)
  22. from aphrodite.modeling.layers.logits_processor import LogitsProcessor
  23. from aphrodite.modeling.layers.vocab_parallel_embedding import (
  24. ParallelLMHead, VocabParallelEmbedding)
  25. if TYPE_CHECKING:
  26. pass
  27. def _get_lora_device(base_layer: nn.Module) -> torch.device:
  28. """Identify the device for positioning the LoRA tensors."""
  29. device = None
  30. try:
  31. device = base_layer.weight.device
  32. except AttributeError:
  33. try:
  34. linear_weights = base_layer.linear_weights
  35. if isinstance(linear_weights, dict):
  36. tensor_values = [
  37. v for v in linear_weights.values()
  38. if isinstance(v, torch.Tensor)
  39. ]
  40. if tensor_values:
  41. device = tensor_values[0].device
  42. except AttributeError:
  43. pass
  44. if device is None:
  45. raise ValueError(f"Base layer not supported: {base_layer}")
  46. return device
  47. def _apply_lora(
  48. x: torch.Tensor,
  49. lora_a_stacked: torch.Tensor,
  50. lora_b_stacked: torch.Tensor,
  51. indices: torch.Tensor,
  52. output: torch.Tensor,
  53. ):
  54. """Applies lora to each input.
  55. This method applies all loras to each input. It uses the
  56. indices vector to determine which lora yields the
  57. correct output. An index of -1 means no lora should be
  58. applied. This method adds the final lora results to the
  59. output.
  60. Input shapes:
  61. x: (batch_size, hidden_dim)
  62. lora_a_stacked: (num_loras, lora_rank, hidden_dim)
  63. lora_b_stacked: (num_loras, output_dim, lora_rank)
  64. indices: (batch_size)
  65. output: (batch_size, output_dim)
  66. """
  67. org_output = output
  68. x = x.view(-1, x.shape[-1])
  69. output = output.view(-1, output.shape[-1])
  70. indices = indices.view(-1)
  71. add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0)
  72. return output.view_as(org_output)
  73. def _apply_lora_packed_nslice(
  74. x: torch.Tensor,
  75. lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
  76. lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
  77. indices: torch.Tensor,
  78. output: torch.Tensor,
  79. output_slices: Tuple[int, ...],
  80. ):
  81. """Applies lora to each input.
  82. This method applies all loras to each input. It uses the
  83. indices vector to determine which lora yields the
  84. correct output. An index of -1 means no lora should be
  85. applied. This method adds the final lora results to the
  86. output.
  87. This method is used for layers that are composed of multiple sublayers
  88. (slices) packed together.
  89. Input shapes:
  90. x: (batch_size, hidden_dim)
  91. lora_a_stacked: 3 element tuple of (num_loras, lora_rank, hidden_dim)
  92. lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank)
  93. indices: (batch_size)
  94. output: (batch_size, q_slice_size + 2*kv_slice_size)
  95. output_slices: n-1 element tuple of (slice_size...),
  96. where n is number of slices
  97. """
  98. org_output = output
  99. x = x.view(-1, x.shape[-1])
  100. output = output.view(-1, output.shape[-1])
  101. indices = indices.view(-1)
  102. offset_left = 0
  103. for slice_idx in range(len(output_slices)):
  104. add_lora_slice(output, x, lora_a_stacked[slice_idx],
  105. lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left,
  106. output_slices[slice_idx])
  107. offset_left += output_slices[slice_idx]
  108. return output.view_as(org_output)
  109. @dataclass
  110. class LoRAMapping:
  111. # Per every token in input_ids:
  112. index_mapping: Tuple[int, ...]
  113. # Per sampled token:
  114. prompt_mapping: Tuple[int, ...]
  115. def __post_init__(self):
  116. self.index_mapping = tuple(self.index_mapping)
  117. self.prompt_mapping = tuple(self.prompt_mapping)
  118. class BaseLayerWithLoRA(nn.Module):
  119. def create_lora_weights(
  120. self,
  121. max_loras: int,
  122. lora_config: LoRAConfig,
  123. model_config: Optional[PretrainedConfig] = None) -> None:
  124. """Initializes lora matrices."""
  125. ...
  126. def reset_lora(self, index: int):
  127. """Resets the lora weights at index back to 0."""
  128. ...
  129. def set_lora(
  130. self,
  131. index: int,
  132. lora_a: torch.Tensor,
  133. lora_b: torch.Tensor,
  134. embeddings_tensor: Optional[torch.Tensor],
  135. ):
  136. """Overwrites lora tensors at index."""
  137. ...
  138. def set_mapping(
  139. self,
  140. base_indices: torch.Tensor,
  141. sampler_indices: torch.Tensor,
  142. sampler_indices_padded: torch.Tensor,
  143. embeddings_indices: torch.Tensor,
  144. indices_len: List[int],
  145. ):
  146. """Sets the mapping indices."""
  147. ...
  148. @classmethod
  149. def can_replace_layer(cls, source_layer: nn.Module,
  150. lora_config: LoRAConfig, packed_modules_list: List,
  151. model_config: Optional[PretrainedConfig]) -> bool:
  152. """Returns True if the layer can be replaced by this LoRA layer."""
  153. raise NotImplementedError
  154. class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
  155. def __init__(self, base_layer: VocabParallelEmbedding) -> None:
  156. super().__init__()
  157. self.base_layer = base_layer
  158. def create_lora_weights(
  159. self,
  160. max_loras: int,
  161. lora_config: LoRAConfig,
  162. model_config: Optional[PretrainedConfig] = None) -> None:
  163. lora_vocab_start_idx = self.base_layer.org_vocab_size
  164. weights_idx = None
  165. if self.base_layer.vocab_end_index > lora_vocab_start_idx:
  166. # We can start adding lora weights
  167. weights_idx = max(
  168. lora_vocab_start_idx - self.base_layer.vocab_start_index, 0)
  169. self.embeddings_slice = (self.base_layer.vocab_start_index -
  170. self.base_layer.org_vocab_size +
  171. weights_idx,
  172. self.base_layer.vocab_end_index -
  173. self.base_layer.org_vocab_size)
  174. self.embeddings_weights = self.base_layer.weight.data[weights_idx:]
  175. self.embeddings_weights.fill_(0)
  176. else:
  177. self.embeddings_slice = None
  178. self.embeddings_weights = None
  179. self.embeddings_tensors = torch.zeros(
  180. (
  181. max_loras,
  182. lora_config.lora_extra_vocab_size,
  183. self.base_layer.embedding_dim,
  184. ),
  185. dtype=self.base_layer.weight.dtype,
  186. device=self.base_layer.weight.device,
  187. )
  188. self.lora_a_stacked = torch.zeros(
  189. (
  190. max_loras,
  191. self.base_layer.org_vocab_size +
  192. lora_config.lora_extra_vocab_size,
  193. lora_config.max_lora_rank,
  194. ),
  195. dtype=lora_config.lora_dtype,
  196. device=self.base_layer.weight.device,
  197. )
  198. self.lora_b_stacked = torch.zeros(
  199. (
  200. max_loras,
  201. 1,
  202. self.base_layer.embedding_dim,
  203. lora_config.max_lora_rank,
  204. ),
  205. dtype=lora_config.lora_dtype,
  206. device=self.base_layer.weight.device,
  207. )
  208. self.lora_a_stacked_2d = self.lora_a_stacked.view(
  209. self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1],
  210. self.lora_a_stacked.shape[2],
  211. )
  212. self.indices: Optional[torch.Tensor] = None
  213. self.indices_len: Optional[List[int]] = None
  214. self.embeddings_indices = None
  215. def reset_lora(self, index: int):
  216. self.lora_a_stacked[index] = 0
  217. self.lora_b_stacked[index] = 0
  218. self.embeddings_tensors[index] = 0
  219. def set_lora(
  220. self,
  221. index: int,
  222. lora_a: torch.Tensor,
  223. lora_b: torch.Tensor,
  224. embeddings_tensor: Optional[torch.Tensor],
  225. ):
  226. self.reset_lora(index)
  227. self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_(
  228. lora_a, non_blocking=True)
  229. self.lora_b_stacked[index,
  230. 0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
  231. lora_b.T, non_blocking=True)
  232. if embeddings_tensor is not None:
  233. self.embeddings_tensors[
  234. index, :embeddings_tensor.shape[0], :embeddings_tensor.
  235. shape[1]].copy_(embeddings_tensor, non_blocking=True)
  236. if self.embeddings_slice is not None:
  237. # TODO(yard1): Optimize this copy, we don't need to copy
  238. # everything, just the modified part
  239. embeddings = self.embeddings_tensors.view(
  240. self.embeddings_tensors.shape[0] *
  241. self.embeddings_tensors.shape[1],
  242. self.embeddings_tensors.shape[2]
  243. )[self.embeddings_slice[0]:self.embeddings_slice[1]]
  244. self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings)
  245. def set_mapping(
  246. self,
  247. base_indices: torch.Tensor,
  248. sampler_indices: torch.Tensor,
  249. sampler_indices_padded: torch.Tensor,
  250. embeddings_indices: torch.Tensor,
  251. indices_len: List[int],
  252. ):
  253. self.indices = base_indices
  254. self.embeddings_indices = embeddings_indices
  255. self.indices_len = indices_len
  256. def forward(self, x: torch.Tensor) -> torch.Tensor:
  257. added_tokens_mask = x > self.base_layer.org_vocab_size - 1
  258. embedding_len = self.indices_len[3]
  259. indices = self.embeddings_indices[1][:embedding_len].view_as(x)
  260. full_lora_a_embeddings = F.embedding(
  261. x + indices,
  262. self.lora_a_stacked_2d,
  263. )
  264. indices = self.embeddings_indices[0][:embedding_len].view_as(x)
  265. full_output = self.base_layer.forward(
  266. x.add_(indices * added_tokens_mask))
  267. full_output_org = full_output
  268. if full_output.ndim == 3:
  269. full_output = full_output.view(
  270. full_output.shape[0] * full_output.shape[1], -1)
  271. if full_lora_a_embeddings.ndim == 3:
  272. full_lora_a_embeddings = full_lora_a_embeddings.view(
  273. full_lora_a_embeddings.shape[0] *
  274. full_lora_a_embeddings.shape[1], -1)
  275. bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked,
  276. self.indices[:self.indices_len[0]], 0, 1.0)
  277. return full_output.view_as(full_output_org)
  278. @classmethod
  279. def can_replace_layer(cls, source_layer: nn.Module,
  280. lora_config: LoRAConfig, packed_modules_list: List,
  281. model_config: Optional[PretrainedConfig]) -> bool:
  282. return type(source_layer) is VocabParallelEmbedding
  283. class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA):
  284. def __init__(self, base_layer: ColumnParallelLinear) -> None:
  285. super().__init__()
  286. self.base_layer = base_layer
  287. self.tp_size = get_tensor_model_parallel_world_size()
  288. def create_lora_weights(
  289. self,
  290. max_loras: int,
  291. lora_config: LoRAConfig,
  292. model_config: Optional[PretrainedConfig] = None) -> None:
  293. self.lora_a_stacked = torch.zeros(
  294. max_loras,
  295. 1,
  296. lora_config.max_lora_rank,
  297. self.base_layer.weight.shape[1],
  298. dtype=lora_config.lora_dtype,
  299. device=self.base_layer.weight.device,
  300. )
  301. self.lora_b_stacked = torch.zeros(
  302. max_loras,
  303. 1,
  304. self.base_layer.weight.shape[0],
  305. lora_config.max_lora_rank,
  306. dtype=lora_config.lora_dtype,
  307. device=self.base_layer.weight.device,
  308. )
  309. self.indices: Optional[torch.Tensor] = None
  310. self.indices_len: Optional[List[int]] = None
  311. self.output_dim = self.lora_b_stacked.shape[2]
  312. def reset_lora(self, index: int):
  313. self.lora_a_stacked[index] = 0
  314. self.lora_b_stacked[index] = 0
  315. def set_lora(
  316. self,
  317. index: int,
  318. lora_a: torch.Tensor,
  319. lora_b: torch.Tensor,
  320. embeddings_tensor: Optional[torch.Tensor],
  321. ):
  322. self.reset_lora(index)
  323. if self.tp_size > 1:
  324. tensor_model_parallel_rank = get_tensor_model_parallel_rank()
  325. shard_size = self.output_dim
  326. start_idx = tensor_model_parallel_rank * shard_size
  327. end_idx = (tensor_model_parallel_rank + 1) * shard_size
  328. lora_b = lora_b[:, start_idx:end_idx]
  329. self.lora_a_stacked[index,
  330. 0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
  331. lora_a.T, non_blocking=True)
  332. self.lora_b_stacked[index,
  333. 0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
  334. lora_b.T, non_blocking=True)
  335. def set_mapping(
  336. self,
  337. base_indices: torch.Tensor,
  338. sampler_indices: torch.Tensor,
  339. sampler_indices_padded: torch.Tensor,
  340. embeddings_indices: torch.Tensor,
  341. indices_len: List[int],
  342. ):
  343. self.indices = base_indices
  344. self.indices_len = indices_len
  345. def apply_weights(self, x: torch.Tensor,
  346. bias: Optional[torch.Tensor]) -> torch.Tensor:
  347. output = self.base_layer.linear_method.apply_weights(
  348. self.base_layer.linear_weights, x, bias)
  349. _apply_lora(
  350. x,
  351. self.lora_a_stacked,
  352. self.lora_b_stacked,
  353. self.indices[:self.indices_len[0]],
  354. output,
  355. )
  356. return output
  357. def forward(self, input_):
  358. """Forward of ColumnParallelLinear
  359. Args:
  360. input_: Tensor whose last dimension is `input_size`.
  361. Returns:
  362. - output
  363. - bias
  364. """
  365. bias = (self.base_layer.bias
  366. if not self.base_layer.skip_bias_add else None)
  367. # Matrix multiply.
  368. output_parallel = self.apply_weights(input_, bias)
  369. if self.base_layer.gather_output:
  370. # All-gather across the partitions.
  371. output = tensor_model_parallel_all_gather(output_parallel)
  372. else:
  373. output = output_parallel
  374. output_bias = (self.base_layer.bias
  375. if self.base_layer.skip_bias_add else None)
  376. return output, output_bias
  377. @property
  378. def linear_weights(self):
  379. return self.base_layer.linear_weights
  380. @classmethod
  381. def can_replace_layer(cls, source_layer: nn.Module,
  382. lora_config: LoRAConfig, packed_modules_list: List,
  383. model_config: Optional[PretrainedConfig]) -> bool:
  384. return type(source_layer) is ColumnParallelLinear or (
  385. type(source_layer) is MergedColumnParallelLinear
  386. and len(packed_modules_list) == 1)
  387. class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
  388. """ColumnParallelLinear layer that is composed of 2 sublayers (slices)
  389. packed together (eg. gate_proj + up_proj -> gate_up_proj).
  390. This means we have 2 LoRAs, each applied to one half of the layer.
  391. Both slices must have the same size.
  392. """
  393. def __init__(self, base_layer: MergedColumnParallelLinear) -> None:
  394. super().__init__(base_layer)
  395. def create_lora_weights(
  396. self,
  397. max_loras: int,
  398. lora_config: LoRAConfig,
  399. model_config: Optional[PretrainedConfig] = None) -> None:
  400. n_slices = 2
  401. if not (len(self.base_layer.output_sizes) == n_slices
  402. and self.base_layer.output_sizes[0]
  403. == self.base_layer.output_sizes[1]):
  404. raise ValueError(
  405. "LoRAColumnParallelLinear2Slice requires 2 slices with "
  406. "the same size.")
  407. self.tp_size = get_tensor_model_parallel_world_size()
  408. device = _get_lora_device(self.base_layer)
  409. self.lora_a_stacked = tuple(
  410. torch.zeros(
  411. max_loras,
  412. 1,
  413. lora_config.max_lora_rank,
  414. self.base_layer.input_size,
  415. dtype=lora_config.lora_dtype,
  416. device=device,
  417. ) for _ in range(n_slices))
  418. self.lora_b_stacked = tuple(
  419. torch.zeros(
  420. max_loras,
  421. 1,
  422. self.base_layer.output_size // 2,
  423. lora_config.max_lora_rank,
  424. dtype=lora_config.lora_dtype,
  425. device=device,
  426. ) for _ in range(n_slices))
  427. self.indices: Optional[torch.Tensor] = None
  428. self.output_dim = self.lora_b_stacked[0].shape[2]
  429. def reset_lora(self, index: int):
  430. self.lora_a_stacked[0][index] = 0
  431. self.lora_a_stacked[1][index] = 0
  432. self.lora_b_stacked[0][index] = 0
  433. self.lora_b_stacked[1][index] = 0
  434. def set_lora(
  435. self,
  436. index: int,
  437. lora_a: torch.Tensor,
  438. lora_b: torch.Tensor,
  439. embeddings_tensor: Optional[torch.Tensor],
  440. ):
  441. self.reset_lora(index)
  442. if self.tp_size > 1:
  443. tensor_model_parallel_rank = get_tensor_model_parallel_rank()
  444. shard_size = self.output_dim
  445. start_idx = tensor_model_parallel_rank * shard_size
  446. end_idx = (tensor_model_parallel_rank + 1) * shard_size
  447. lora_b = lora_b[0][:,
  448. start_idx:end_idx], lora_b[1][:,
  449. start_idx:end_idx]
  450. if lora_a[0] is not None:
  451. self.lora_a_stacked[0][
  452. index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_(
  453. lora_a[0].T, non_blocking=True)
  454. self.lora_b_stacked[0][
  455. index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_(
  456. lora_b[0].T, non_blocking=True)
  457. if lora_a[1] is not None:
  458. self.lora_a_stacked[1][
  459. index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_(
  460. lora_a[1].T, non_blocking=True)
  461. self.lora_b_stacked[1][
  462. index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_(
  463. lora_b[1].T, non_blocking=True)
  464. def apply_weights(self, x: torch.Tensor,
  465. bias: Optional[torch.Tensor]) -> torch.Tensor:
  466. output = self.base_layer.linear_method.apply_weights(
  467. self.base_layer.linear_weights, x, bias)
  468. _apply_lora_packed_nslice(
  469. x,
  470. self.lora_a_stacked,
  471. self.lora_b_stacked,
  472. self.indices[:self.indices_len[0]],
  473. output,
  474. (self.output_dim, self.output_dim),
  475. )
  476. return output
  477. @classmethod
  478. def can_replace_layer(cls, source_layer: nn.Module,
  479. lora_config: LoRAConfig, packed_modules_list: List,
  480. model_config: Optional[PretrainedConfig]) -> bool:
  481. return type(source_layer) is MergedColumnParallelLinear and len(
  482. packed_modules_list) == 2
  483. class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
  484. """
  485. ColumnParallelLinear layer that is specifically designed for
  486. qkv_proj. Certain models, such as chtglm3 and baichuan-7b,
  487. only contains a single LoRA within their qkv_proj layer.
  488. During inference with Tensor Parallel, the weights of lora_b
  489. must be accurately partitioned according to the respective ranks.
  490. Q slice may have different shape than K and V slices (which both have
  491. the same shape).
  492. """
  493. def __init__(self, base_layer: QKVParallelLinear) -> None:
  494. super().__init__(base_layer)
  495. self.tp_size = get_tensor_model_parallel_world_size()
  496. self.q_proj_total_size = (self.base_layer.total_num_heads *
  497. self.base_layer.head_size)
  498. self.q_proj_shard_size = (self.base_layer.num_heads *
  499. self.base_layer.head_size)
  500. self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
  501. self.base_layer.head_size)
  502. self.kv_proj_total_size = (self.base_layer.total_num_kv_heads *
  503. self.base_layer.head_size)
  504. def set_lora(
  505. self,
  506. index: int,
  507. lora_a: torch.Tensor,
  508. lora_b: torch.Tensor,
  509. embeddings_tensor: Optional[torch.Tensor],
  510. ):
  511. self.reset_lora(index)
  512. if self.tp_size > 1:
  513. tp_rank = get_tensor_model_parallel_rank()
  514. self.q_shard_id = tp_rank
  515. self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas
  516. lora_b_q = lora_b[:, self.q_proj_shard_size *
  517. self.q_shard_id:self.q_proj_shard_size *
  518. (self.q_shard_id + 1)]
  519. k_offset = self.q_proj_total_size
  520. lora_b_k = lora_b[:, k_offset + self.kv_proj_shard_size *
  521. self.kv_shard_id:k_offset +
  522. self.kv_proj_shard_size * (self.kv_shard_id + 1)]
  523. v_offset = k_offset + self.kv_proj_total_size
  524. lora_b_v = lora_b[:, v_offset + self.kv_proj_shard_size *
  525. self.kv_shard_id:v_offset +
  526. self.kv_proj_shard_size * (self.kv_shard_id + 1)]
  527. lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1)
  528. self.lora_a_stacked[index,
  529. 0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
  530. lora_a.T, non_blocking=True)
  531. self.lora_b_stacked[index,
  532. 0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
  533. lora_b.T, non_blocking=True)
  534. @classmethod
  535. def can_replace_layer(cls, source_layer: nn.Module,
  536. lora_config: LoRAConfig, packed_modules_list: List,
  537. model_config: Optional[PretrainedConfig]) -> bool:
  538. return type(source_layer) is QKVParallelLinear and len(
  539. packed_modules_list) == 1
  540. class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
  541. """ColumnParallelLinear layer that is composed of 3 sublayers (slices)
  542. packed together in qkv proj fashion
  543. (q_proj + k_proj + v_proj -> qkv_proj).
  544. This means we have 3 LoRAs, each applied to one slice of the layer.
  545. Q slice may have different shape than K and V slices (which both have
  546. the same shape).
  547. """
  548. def __init__(self, base_layer: QKVParallelLinear) -> None:
  549. super().__init__(base_layer)
  550. def create_lora_weights(
  551. self,
  552. max_loras: int,
  553. lora_config: LoRAConfig,
  554. model_config: Optional[PretrainedConfig] = None) -> None:
  555. self.tp_size = get_tensor_model_parallel_world_size()
  556. tp_rank = get_tensor_model_parallel_rank()
  557. self.q_proj_shard_size = (self.base_layer.num_heads *
  558. self.base_layer.head_size)
  559. self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
  560. self.base_layer.head_size)
  561. self.q_shard_id = tp_rank
  562. self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas
  563. device = _get_lora_device(self.base_layer)
  564. # q, k, v
  565. self.lora_a_stacked = (
  566. torch.zeros(
  567. max_loras,
  568. 1,
  569. lora_config.max_lora_rank,
  570. self.base_layer.input_size,
  571. dtype=lora_config.lora_dtype,
  572. device=device,
  573. ),
  574. torch.zeros(
  575. max_loras,
  576. 1,
  577. lora_config.max_lora_rank,
  578. self.base_layer.input_size,
  579. dtype=lora_config.lora_dtype,
  580. device=device,
  581. ),
  582. torch.zeros(
  583. max_loras,
  584. 1,
  585. lora_config.max_lora_rank,
  586. self.base_layer.input_size,
  587. dtype=lora_config.lora_dtype,
  588. device=device,
  589. ),
  590. )
  591. self.lora_b_stacked = (
  592. torch.zeros(
  593. max_loras,
  594. 1,
  595. self.q_proj_shard_size,
  596. lora_config.max_lora_rank,
  597. dtype=lora_config.lora_dtype,
  598. device=device,
  599. ),
  600. torch.zeros(
  601. max_loras,
  602. 1,
  603. self.kv_proj_shard_size,
  604. lora_config.max_lora_rank,
  605. dtype=lora_config.lora_dtype,
  606. device=device,
  607. ),
  608. torch.zeros(
  609. max_loras,
  610. 1,
  611. self.kv_proj_shard_size,
  612. lora_config.max_lora_rank,
  613. dtype=lora_config.lora_dtype,
  614. device=device,
  615. ),
  616. )
  617. self.output_slices = (self.q_proj_shard_size, self.kv_proj_shard_size,
  618. self.kv_proj_shard_size)
  619. self.packed_indices: Optional[torch.Tensor] = None
  620. self.standard_indices: Optional[torch.Tensor] = None
  621. self.indices_len: Optional[List[int]] = None
  622. def reset_lora(self, index: int):
  623. self.lora_a_stacked[0][index] = 0
  624. self.lora_b_stacked[0][index] = 0
  625. self.lora_a_stacked[1][index] = 0
  626. self.lora_b_stacked[1][index] = 0
  627. self.lora_a_stacked[2][index] = 0
  628. self.lora_b_stacked[2][index] = 0
  629. def set_lora(
  630. self,
  631. index: int,
  632. lora_a: torch.Tensor,
  633. lora_b: torch.Tensor,
  634. embeddings_tensor: Optional[torch.Tensor],
  635. ):
  636. self.reset_lora(index)
  637. if self.tp_size > 1:
  638. if lora_b[0] is not None:
  639. lora_b_q = lora_b[0][:, self.q_proj_shard_size *
  640. self.q_shard_id:self.q_proj_shard_size *
  641. (self.q_shard_id + 1)]
  642. self.lora_b_stacked[0][
  643. index, 0, :lora_b_q.shape[1], :lora_b_q.shape[0]].copy_(
  644. lora_b_q.T, non_blocking=True)
  645. if lora_b[1] is not None:
  646. lora_b_k = lora_b[1][:, self.kv_proj_shard_size *
  647. self.kv_shard_id:self.kv_proj_shard_size *
  648. (self.kv_shard_id + 1)]
  649. self.lora_b_stacked[1][
  650. index, 0, :lora_b_k.shape[1], :lora_b_k.shape[0]].copy_(
  651. lora_b_k.T, non_blocking=True)
  652. if lora_b[2] is not None:
  653. lora_b_v = lora_b[2][:, self.kv_proj_shard_size *
  654. self.kv_shard_id:self.kv_proj_shard_size *
  655. (self.kv_shard_id + 1)]
  656. self.lora_b_stacked[2][
  657. index, 0, :lora_b_v.shape[1], :lora_b_v.shape[0]].copy_(
  658. lora_b_v.T, non_blocking=True)
  659. else:
  660. if lora_b[0] is not None:
  661. self.lora_b_stacked[0][
  662. index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_(
  663. lora_b[0].T, non_blocking=True)
  664. if lora_b[1] is not None:
  665. self.lora_b_stacked[1][
  666. index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_(
  667. lora_b[1].T, non_blocking=True)
  668. if lora_b[2] is not None:
  669. self.lora_b_stacked[2][
  670. index, 0, :lora_b[2].shape[1], :lora_b[2].shape[0]].copy_(
  671. lora_b[2].T, non_blocking=True)
  672. if lora_a[0] is not None:
  673. self.lora_a_stacked[0][
  674. index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_(
  675. lora_a[0].T, non_blocking=True)
  676. if lora_a[1] is not None:
  677. self.lora_a_stacked[1][
  678. index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_(
  679. lora_a[1].T, non_blocking=True)
  680. if lora_a[2] is not None:
  681. self.lora_a_stacked[2][
  682. index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_(
  683. lora_a[2].T, non_blocking=True)
  684. def apply_weights(self, x: torch.Tensor,
  685. bias: Optional[torch.Tensor]) -> torch.Tensor:
  686. output = self.base_layer.linear_method.apply_weights(
  687. self.base_layer.linear_weights, x, bias)
  688. _apply_lora_packed_nslice(
  689. x,
  690. self.lora_a_stacked,
  691. self.lora_b_stacked,
  692. self.indices[:self.indices_len[0]],
  693. output,
  694. self.output_slices,
  695. )
  696. return output
  697. @classmethod
  698. def can_replace_layer(cls, source_layer: nn.Module,
  699. lora_config: LoRAConfig, packed_modules_list: List,
  700. model_config: Optional[PretrainedConfig]) -> bool:
  701. return type(source_layer) is QKVParallelLinear and len(
  702. packed_modules_list) == 3
  703. class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
  704. def __init__(self, base_layer: RowParallelLinear) -> None:
  705. super().__init__()
  706. self.base_layer = base_layer
  707. def create_lora_weights(
  708. self,
  709. max_loras: int,
  710. lora_config: LoRAConfig,
  711. model_config: Optional[PretrainedConfig] = None) -> None:
  712. device = _get_lora_device(self.base_layer)
  713. self.lora_a_stacked = torch.zeros(
  714. (
  715. max_loras,
  716. 1,
  717. lora_config.max_lora_rank,
  718. self.base_layer.input_size,
  719. ),
  720. dtype=lora_config.lora_dtype,
  721. device=device,
  722. )
  723. self.lora_b_stacked = torch.zeros(
  724. (
  725. max_loras,
  726. 1,
  727. self.base_layer.output_size,
  728. lora_config.max_lora_rank,
  729. ),
  730. dtype=lora_config.lora_dtype,
  731. device=device,
  732. )
  733. self.indices: Optional[torch.Tensor] = None
  734. self.indices_len: Optional[List[int]] = None
  735. def reset_lora(self, index: int):
  736. self.lora_a_stacked[index] = 0
  737. self.lora_b_stacked[index] = 0
  738. def set_lora(
  739. self,
  740. index: int,
  741. lora_a: torch.Tensor,
  742. lora_b: torch.Tensor,
  743. embeddings_tensor: Optional[torch.Tensor],
  744. ):
  745. self.reset_lora(index)
  746. if self.base_layer.tp_size > 1:
  747. tensor_model_parallel_rank = get_tensor_model_parallel_rank()
  748. shard_size = self.base_layer.weight.shape[1]
  749. start_idx = tensor_model_parallel_rank * shard_size
  750. end_idx = (tensor_model_parallel_rank + 1) * shard_size
  751. lora_a = lora_a[start_idx:end_idx, :]
  752. self.lora_a_stacked[index,
  753. 0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
  754. lora_a.T, non_blocking=True)
  755. self.lora_b_stacked[index,
  756. 0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
  757. lora_b.T, non_blocking=True)
  758. def set_mapping(
  759. self,
  760. base_indices: torch.Tensor,
  761. sampler_indices: torch.Tensor,
  762. sampler_indices_padded: torch.Tensor,
  763. embeddings_indices: torch.Tensor,
  764. indices_len: List[int],
  765. ):
  766. self.indices = base_indices
  767. self.indices_len = indices_len
  768. def apply_weights(self, x: torch.Tensor) -> torch.Tensor:
  769. output = self.base_layer.linear_method.apply_weights(
  770. self.base_layer.linear_weights, x)
  771. _apply_lora(
  772. x,
  773. self.lora_a_stacked,
  774. self.lora_b_stacked,
  775. self.indices[:self.indices_len[0]],
  776. output,
  777. )
  778. return output
  779. def forward(self, input_):
  780. """Forward of RowParallelLinear
  781. Args:
  782. input_: tensor whose last dimension is `input_size`. If
  783. `input_is_parallel` is set, then the last dimension
  784. is `input_size // tp_size`.
  785. Returns:
  786. - output
  787. - bias
  788. """
  789. # Set up backprop all-reduce.
  790. if self.base_layer.input_is_parallel:
  791. input_parallel = input_
  792. else:
  793. # TODO: simplify code below
  794. tp_rank = get_tensor_model_parallel_rank()
  795. splitted_input = split_tensor_along_last_dim(
  796. input_, num_partitions=self.base_layer.tp_size)
  797. input_parallel = splitted_input[tp_rank].contiguous()
  798. # Matrix multiply.
  799. output_parallel = self.apply_weights(input_parallel)
  800. if self.base_layer.reduce_results and self.base_layer.tp_size > 1:
  801. output_ = tensor_model_parallel_all_reduce(output_parallel)
  802. else:
  803. output_ = output_parallel
  804. if not self.base_layer.skip_bias_add:
  805. output = (output_ + self.base_layer.bias
  806. if self.base_layer.bias is not None else output_)
  807. output_bias = None
  808. else:
  809. output = output_
  810. output_bias = self.base_layer.bias
  811. return output, output_bias
  812. @property
  813. def weight(self):
  814. return self.base_layer.weight
  815. @classmethod
  816. def can_replace_layer(cls, source_layer: nn.Module,
  817. lora_config: LoRAConfig, packed_modules_list: List,
  818. model_config: Optional[PretrainedConfig]) -> bool:
  819. return type(source_layer) is RowParallelLinear
  820. class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
  821. def __init__(
  822. self,
  823. base_layer: LogitsProcessor,
  824. hidden_size: int,
  825. dtype: torch.dtype,
  826. device: torch.device,
  827. ) -> None:
  828. super().__init__()
  829. self.base_layer = base_layer
  830. self.hidden_size = hidden_size
  831. self.dtype = dtype
  832. self.device = device
  833. @property
  834. def logits_as_input(self):
  835. return self.base_layer.logits_as_input
  836. @property
  837. def vocab_size(self):
  838. return self.base_layer.vocab_size
  839. @property
  840. def scale(self):
  841. return self.base_layer.scale
  842. @property
  843. def org_vocab_size(self):
  844. return self.base_layer.org_vocab_size
  845. @property
  846. def include_gpu_probs_tensor(self):
  847. return self.base_layer.include_gpu_probs_tensor
  848. def create_lora_weights(
  849. self,
  850. max_loras: int,
  851. lora_config: LoRAConfig,
  852. model_config: Optional[PretrainedConfig] = None,
  853. ) -> None:
  854. # Keep this in sync with csrc/punica/bgmv/bgmv_config.h
  855. if 32000 < self.base_layer.vocab_size > 128512:
  856. raise ValueError("When using LoRA, vocab size must be "
  857. "32000 >= vocab_size <= 128512")
  858. self.lora_a_stacked = torch.zeros(
  859. (
  860. max_loras,
  861. 1,
  862. lora_config.max_lora_rank,
  863. self.hidden_size,
  864. ),
  865. dtype=lora_config.lora_dtype,
  866. device=self.device,
  867. )
  868. self.lora_b_stacked = torch.zeros(
  869. (
  870. max_loras,
  871. 1,
  872. # Pad for kernel compatibility
  873. math.ceil(self.base_layer.vocab_size /
  874. lora_config.lora_vocab_padding_size) *
  875. lora_config.lora_vocab_padding_size,
  876. lora_config.max_lora_rank,
  877. ),
  878. dtype=lora_config.lora_dtype,
  879. device=self.device,
  880. )
  881. self.embeddings_tensors = torch.full(
  882. (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size),
  883. fill_value=float("-inf"),
  884. dtype=self.dtype,
  885. device=self.device,
  886. )
  887. self.indices = None
  888. self.indices_padded = None
  889. self.indices_len = None
  890. def reset_lora(self, index: int):
  891. self.lora_a_stacked[index] = 0
  892. self.lora_b_stacked[index] = 0
  893. self.embeddings_tensors[index] = float("-inf")
  894. def set_lora(
  895. self,
  896. index: int,
  897. lora_a: torch.Tensor,
  898. lora_b: torch.Tensor,
  899. embeddings_tensor: Optional[torch.Tensor],
  900. ):
  901. self.reset_lora(index)
  902. self.lora_a_stacked[index,
  903. 0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
  904. lora_a.T, non_blocking=True)
  905. self.lora_b_stacked[index,
  906. 0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
  907. lora_b.T, non_blocking=True)
  908. if embeddings_tensor is not None:
  909. self.embeddings_tensors[
  910. index, :embeddings_tensor.shape[0], :embeddings_tensor.
  911. shape[1], ] = embeddings_tensor
  912. def set_mapping(
  913. self,
  914. base_indices: torch.Tensor,
  915. sampler_indices: torch.Tensor,
  916. sampler_indices_padded: torch.Tensor,
  917. embeddings_indices: torch.Tensor,
  918. indices_len: List[int],
  919. ):
  920. self.indices = sampler_indices
  921. self.indices_padded = sampler_indices_padded
  922. self.indices_len = indices_len
  923. def _get_logits(
  924. self,
  925. hidden_states: torch.Tensor,
  926. lm_head: torch.Tensor,
  927. embedding_bias: Optional[torch.Tensor] = None,
  928. ) -> Optional[torch.Tensor]:
  929. # Get the logits for the next tokens.
  930. logits = lm_head(hidden_states)
  931. if embedding_bias is not None:
  932. logits += embedding_bias
  933. logits = tensor_model_parallel_gather(logits)
  934. if logits is None:
  935. return None
  936. lora_logits = torch.empty(
  937. self.embeddings_tensors.shape[0] + 1,
  938. self.embeddings_tensors.shape[1],
  939. hidden_states.shape[0],
  940. dtype=self.embeddings_tensors.dtype,
  941. device=self.embeddings_tensors.device,
  942. )
  943. torch.matmul(self.embeddings_tensors,
  944. hidden_states.T,
  945. out=lora_logits[:-1])
  946. lora_logits[-1] = float("-inf")
  947. lora_logits = lora_logits.mT
  948. lora_logits = (lora_logits.reshape(
  949. lora_logits.shape[0] * lora_logits.shape[1],
  950. lora_logits.shape[2],
  951. ).index_select(0,
  952. self.indices_padded[:self.indices_len[2]]).nan_to_num_(
  953. nan=float("-inf"),
  954. posinf=float("inf"),
  955. neginf=float("-inf")))
  956. logits[:,
  957. self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
  958. lora_logits.shape[1]] = lora_logits
  959. _apply_lora(
  960. hidden_states,
  961. self.lora_a_stacked,
  962. self.lora_b_stacked,
  963. self.indices[:self.indices_len[1]],
  964. logits,
  965. )
  966. # Remove paddings in vocab (if any).
  967. logits = logits[:, :self.base_layer.vocab_size]
  968. return logits
  969. def forward(self, *args, **kwargs):
  970. return type(self.base_layer).forward(self, *args, **kwargs)
  971. @classmethod
  972. def can_replace_layer(cls, source_layer: nn.Module,
  973. lora_config: LoRAConfig, packed_modules_list: List,
  974. model_config: Optional[PretrainedConfig]) -> bool:
  975. # Special handling for the LogitsProcessor.
  976. return False
  977. _all_lora_classes: Set[Type[BaseLayerWithLoRA]] = {
  978. cls
  979. for cls in globals().values() if inspect.isclass(cls)
  980. and issubclass(cls, BaseLayerWithLoRA) and cls is not BaseLayerWithLoRA
  981. }
  982. def from_layer(layer: nn.Module,
  983. max_loras: int,
  984. lora_config: LoRAConfig,
  985. packed_modules_list: List,
  986. model_config: Optional[PretrainedConfig] = None) -> nn.Module:
  987. for lora_cls in _all_lora_classes:
  988. if lora_cls.can_replace_layer(layer, lora_config, packed_modules_list,
  989. model_config):
  990. ret = lora_cls(layer)
  991. ret.create_lora_weights(max_loras, lora_config, model_config)
  992. return ret
  993. return layer
  994. def from_layer_logits_processor(
  995. layer: LogitsProcessor,
  996. lm_head: ParallelLMHead,
  997. max_loras: int,
  998. lora_config: LoRAConfig,
  999. model_config: Optional[PretrainedConfig] = None,
  1000. ) -> LogitsProcessorWithLoRA:
  1001. ret = LogitsProcessorWithLoRA(layer, lm_head.embedding_dim,
  1002. lm_head.weight.dtype, lm_head.weight.device)
  1003. ret.create_lora_weights(max_loras, lora_config, model_config)
  1004. return ret