# sgmv_expand_slice.py
"""
Based on:
Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
Punica: Multi-Tenant LoRA Serving.
https://arxiv.org/abs/2310.18547
"""
import torch
import triton
import triton.language as tl

from aphrodite.triton_utils import libentry


@libentry()
@triton.jit
def _sgmv_expand_slice_kernel(
    input_ptr,
    lora_ptr,
    out_ptr,
    N,
    K,
    b_seq_start_loc,
    seq_lens,
    lora_indices,
    xm_stride,
    xk_stride,  # 1
    l0_stride,  # hidden_size * max_rank
    lora_k_stride,
    lora_n_stride,
    cm_stride,
    cn_stride,
    slice_offset,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    EVEN_K: tl.constexpr,
    ADD_INPUTS: tl.constexpr,
    CAST_TYPE: tl.constexpr,
):
    """
    Similar to the 'sgmv_expand' kernel, but with an additional 'slice_offset'
    parameter so the result can be written into a column slice of the output.
    It is kept separate from 'sgmv_expand' so that, in the future, the
    repeated per-slice calls could be replaced by a single fused operator.
    """
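    # Grid layout: axis 0 enumerates the (BLOCK_M x BLOCK_N) output tiles of a
    # sequence, axis 1 enumerates the sequences in the batch. Each program
    # computes one tile of
    #   out[rows, slice_offset:slice_offset + N] (+)= x[rows, :K] @ B[lora].T
    # where B[lora] has shape (N, K) = (hidden_size, rank).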
    pid = tl.program_id(axis=0)
    cur_batch = tl.program_id(axis=1)
    cta_n_num = tl.cdiv(N, BLOCK_N)
    pid_m = pid // cta_n_num
    pid_n = pid % cta_n_num
    M = tl.load(seq_lens + cur_batch)
    if pid_m * BLOCK_M > M:
        return
    lora_index = tl.load(lora_indices + cur_batch)
    # A lora_index of -1 means no LoRA is applied to this sequence.
    if lora_index == -1:
        return
    cur_seq_start = tl.load(b_seq_start_loc + cur_batch)
    offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
    offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
    offset_k = tl.arange(0, BLOCK_K)
    ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M)
    rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N)
    a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride +
             offset_k[None, :] * xk_stride)
    # lora_b is read transposed: rows walk the rank (k) dim, columns the
    # hidden (n) dim, so the dot below yields x @ B.T for this tile.
    b_ptr = (lora_ptr + l0_stride * lora_index +
             offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride)
    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(tl.cdiv(K, BLOCK_K)):
        if EVEN_K:
            tiled_a = tl.load(a_ptr)
            tiled_b = tl.load(b_ptr)
        else:
            tiled_a = tl.load(a_ptr,
                              mask=offset_k[None, :] < K - k * BLOCK_K,
                              other=0)
            tiled_b = tl.load(b_ptr,
                              mask=offset_k[:, None] < K - k * BLOCK_K,
                              other=0)
        if CAST_TYPE:
            tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
        accumulator += tl.dot(
            tiled_a,
            tiled_b,
        )
        a_ptr += BLOCK_K * xk_stride
        b_ptr += BLOCK_K * lora_n_stride
    tiled_c = accumulator.to(lora_ptr.dtype.element_ty)
    offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
    # Shift the output columns by slice_offset so only this slice is written.
    offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + slice_offset
    c_ptr = (out_ptr + offset_cm[:, None] * cm_stride +
             offset_cn[None, :] * cn_stride)
    M = tl.load(seq_lens + cur_batch)
    c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] <
                                                           (slice_offset + N))
    if ADD_INPUTS:
        tiled_out = tl.load(c_ptr, mask=c_mask)
        tiled_c += tiled_out
    tl.store(c_ptr, tiled_c, mask=c_mask)


@torch.inference_mode()
def _sgmv_expand_slice(
    inputs: torch.Tensor,
    lora_b_weights: torch.Tensor,
    output_tensor: torch.Tensor,
    b_seq_start_loc: torch.Tensor,
    seq_len_tensor: torch.Tensor,
    lora_indices_tensor: torch.Tensor,
    batches: int,
    max_seq_length: int,
    slice_offset: int,
    slice_size: int,
    add_inputs: bool = False,
) -> None:
    """
    Expands LoRA outputs into one slice of output_tensor: for each sequence,
    inputs @ lora_b_weights[lora_index].T is written (or accumulated, when
    add_inputs is True) into columns
    [slice_offset, slice_offset + slice_size) of output_tensor.

    Args:
        inputs (torch.Tensor): Input tensor of shape (total_tokens, rank).
        lora_b_weights (torch.Tensor): LoRA B (expand) weights.
        output_tensor (torch.Tensor): Output tensor.
        b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative
            sequence lengths of the sequences in the batch, used to index
            into the token dimension. E.g., if the sequence lengths are
            [4, 6], it is [0, 4].
        seq_len_tensor (torch.Tensor): (batch_size,). Records the length of
            each sequence in the batch.
        lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index
            corresponding to each sequence. An index of -1 means no LoRA
            should be applied.
        batches (int): Batch size.
        max_seq_length (int): The maximum sequence length in the batch.
        slice_offset (int): Column offset into output_tensor at which this
            slice starts.
        slice_size (int): Width (hidden size) of this output slice.
        add_inputs (bool, optional): If True, add the LoRA results to the
            existing values in output_tensor instead of overwriting them.
            Defaults to False.

    A minimal usage sketch appears at the bottom of this file.
    """
    assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32]
    assert lora_b_weights.dtype in [
        torch.float16,
        torch.bfloat16,
    ]
    assert inputs.size(1) == lora_b_weights.size(-1)
    assert b_seq_start_loc.size(0) == batches
    assert lora_indices_tensor.size(0) == batches
    assert slice_size == lora_b_weights.size(-2)
    assert inputs.is_contiguous()
    assert output_tensor.is_contiguous()
    if lora_b_weights.ndim == 4:  # shape: (lora_num, 1, size, rank)
        assert lora_b_weights.size(1) == 1
        lora_b_weights = lora_b_weights.squeeze(dim=1)
    else:
        assert lora_b_weights.ndim == 3  # shape: (lora_num, size, rank)
    assert lora_b_weights.is_contiguous()
    # TODO: tune this config.
    N, K = lora_b_weights.shape[-2:]  # K = rank, N = hidden_size
    BLOCK_M = 32
    BLOCK_N = 32
    BLOCK_K = 16
    EVEN_K = K % BLOCK_K == 0
    ADD_INPUTS = add_inputs
    CAST_TYPE = False
    if inputs.dtype == torch.float32 and lora_b_weights.dtype in [
            torch.float16,
            torch.bfloat16,
    ]:
        CAST_TYPE = True
    grid = (
        triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N),
        batches,
    )
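    # One program per (M-tile, N-tile) pair along axis 0 and one per sequence
    # along axis 1; programs whose M-tile lies past the end of a shorter
    # sequence exit early inside the kernel.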
    _sgmv_expand_slice_kernel[grid](
        inputs,
        lora_b_weights,
        output_tensor,
        N,
        K,
        b_seq_start_loc,
        seq_len_tensor,
        lora_indices_tensor,
        inputs.stride(0),
        inputs.stride(1),
        lora_b_weights.stride(0),
        lora_b_weights.stride(1),
        lora_b_weights.stride(2),
        output_tensor.stride(0),
        output_tensor.stride(1),
        slice_offset,
        BLOCK_M,
        BLOCK_N,
        BLOCK_K,
        EVEN_K,
        ADD_INPUTS,
        CAST_TYPE,
    )
    return


# torch.library.custom_op is only available in newer PyTorch releases; fall
# back to the plain function when it is missing.
try:
    sgmv_expand_slice = torch.library.custom_op("lora::sgmv_expand_slice",
                                                _sgmv_expand_slice,
                                                mutates_args=["output_tensor"])
except AttributeError:
    sgmv_expand_slice = _sgmv_expand_slice
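

# ---------------------------------------------------------------------------
# Minimal usage sketch. This is an illustration only: the shapes, dtypes, and
# two-slice layout below are hypothetical, and running it assumes a CUDA
# device with Triton available. Two sequences of lengths 4 and 6 use adapters
# 0 and 1; the fused output holds two slices, each filled by a separate call
# at a different slice_offset.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    num_loras, hidden_size, rank = 2, 128, 16
    seq_lens = [4, 6]
    total_tokens = sum(seq_lens)
    device = "cuda"

    inputs = torch.randn(total_tokens, rank, dtype=torch.float16,
                         device=device)
    # One B matrix of shape (hidden_size, rank) per slice and per adapter.
    lora_b_slice0 = torch.randn(num_loras, hidden_size, rank,
                                dtype=torch.float16, device=device)
    lora_b_slice1 = torch.randn(num_loras, hidden_size, rank,
                                dtype=torch.float16, device=device)
    output = torch.zeros(total_tokens, 2 * hidden_size, dtype=torch.float16,
                         device=device)

    b_seq_start_loc = torch.tensor([0, 4], dtype=torch.long, device=device)
    seq_len_tensor = torch.tensor(seq_lens, dtype=torch.long, device=device)
    lora_indices = torch.tensor([0, 1], dtype=torch.long, device=device)

    # The first slice occupies columns [0, hidden_size), the second
    # [hidden_size, 2 * hidden_size).
    sgmv_expand_slice(inputs, lora_b_slice0, output, b_seq_start_loc,
                      seq_len_tensor, lora_indices, 2, max(seq_lens),
                      0, hidden_size, False)
    sgmv_expand_slice(inputs, lora_b_slice1, output, b_seq_start_loc,
                      seq_len_tensor, lora_indices, 2, max(seq_lens),
                      hidden_size, hidden_size, False)
    print(output.shape)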