sgmv_expand.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. """
  2. Based on:
  3. Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
  4. Punica: Multi-Tenant LoRA Serving.
  5. https://arxiv.org/abs/2310.18547
  6. """
  7. import torch
  8. import triton
  9. import triton.language as tl
  10. from aphrodite.triton_utils import libentry
  11. @libentry()
  12. @triton.jit
  13. def _sgmv_expand_kernel(
  14. input_ptr,
  15. lora_ptr,
  16. out_ptr,
  17. N,
  18. K,
  19. b_seq_start_loc,
  20. seq_lens,
  21. lora_indices,
  22. xm_stride,
  23. xk_stride, # 1
  24. l0_stride, # hidden_size*max_rank
  25. lora_k_stride,
  26. lora_n_stride,
  27. cm_stride,
  28. cn_stride,
  29. BLOCK_M: tl.constexpr,
  30. BLOCK_N: tl.constexpr,
  31. BLOCK_K: tl.constexpr,
  32. EVEN_K: tl.constexpr,
  33. ADD_INPUTS: tl.constexpr,
  34. CAST_TYPE: tl.constexpr,
  35. ):
  36. """
  37. The sgmv's expand triton kernel is based on GroupGEMM.
  38. """
  39. pid = tl.program_id(axis=0)
  40. cur_batch = tl.program_id(axis=1)
  41. cta_n_num = tl.cdiv(N, BLOCK_N)
  42. pid_m = pid // cta_n_num
  43. pid_n = pid % cta_n_num
  44. M = tl.load(seq_lens + cur_batch)
  45. if pid_m * BLOCK_M > M:
  46. return
  47. lora_index = tl.load(lora_indices + cur_batch)
  48. if lora_index == -1:
  49. return
  50. cur_seq_start = tl.load(b_seq_start_loc + cur_batch)
  51. offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
  52. offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
  53. offset_k = tl.arange(0, BLOCK_K)
  54. ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M)
  55. rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N)
  56. a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride +
  57. offset_k[None, :] * xk_stride, )
  58. b_ptr = (lora_ptr + l0_stride * lora_index +
  59. offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride)
  60. accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
  61. for k in range(tl.cdiv(K, BLOCK_K)):
  62. if EVEN_K:
  63. tiled_a = tl.load(a_ptr)
  64. tiled_b = tl.load(b_ptr)
  65. else:
  66. tiled_a = tl.load(a_ptr,
  67. mask=offset_k[None, :] < K - k * BLOCK_K,
  68. other=0)
  69. tiled_b = tl.load(b_ptr,
  70. mask=offset_k[:, None] < K - k * BLOCK_K,
  71. other=0)
  72. if CAST_TYPE:
  73. tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
  74. accumulator += tl.dot(
  75. tiled_a,
  76. tiled_b,
  77. )
  78. a_ptr += BLOCK_K * xk_stride
  79. b_ptr += BLOCK_K * lora_n_stride
  80. tiled_c = accumulator.to(lora_ptr.dtype.element_ty)
  81. offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
  82. offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
  83. c_ptr = (out_ptr + offset_cm[:, None] * cm_stride +
  84. offset_cn[None, :] * cn_stride)
  85. M = tl.load(seq_lens + cur_batch)
  86. c_mask = (offset_cm[:, None] <
  87. (cur_seq_start + M)) & (offset_cn[None, :] < N)
  88. if ADD_INPUTS:
  89. tiled_out = tl.load(c_ptr, mask=c_mask)
  90. tiled_c += tiled_out
  91. tl.store(c_ptr, tiled_c, mask=c_mask)
  92. @torch.inference_mode()
  93. def _sgmv_expand(
  94. inputs: torch.Tensor,
  95. lora_b_weights: torch.Tensor,
  96. output_tensor: torch.Tensor,
  97. b_seq_start_loc: torch.Tensor,
  98. seq_len_tensor: torch.Tensor,
  99. lora_indices_tensor: torch.Tensor,
  100. batches: int,
  101. max_seq_length: int,
  102. add_inputs: bool = False,
  103. ) -> None:
  104. """
  105. Args:
  106. inputs (torch.Tensor): input tensor
  107. lora_b_weights (torch.Tensor): lora'a weight
  108. output_tensor (torch.Tensor): output tensor
  109. b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative
  110. sequence lengths of the sequences in the batch, used to index
  111. into sequence. E.g.,if the sequence length is [4, 6], it is
  112. [0, 4, 10].
  113. seq_len_tensor (torch.Tensor): (batch_size,). record the sequence
  114. length of the sequences in the batch
  115. lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index
  116. corresponding to each batch. An index of -1 means no lora should be
  117. applied.
  118. batches (int): batch size
  119. max_seq_length (int): The max sequence lengths of the sequences
  120. in the batch
  121. add_inputs (bool, optional): Defaults to False. adds the final lora
  122. results to the output.
  123. """
  124. assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32]
  125. assert lora_b_weights.dtype in [
  126. torch.float16,
  127. torch.bfloat16,
  128. ]
  129. assert inputs.size(1) == lora_b_weights.size(-1)
  130. assert b_seq_start_loc.size(0) == batches
  131. assert lora_indices_tensor.size(0) == batches
  132. assert inputs.is_contiguous()
  133. assert output_tensor.is_contiguous()
  134. if lora_b_weights.ndim == 4: # shape:(lora_num,1,size,rank)
  135. assert lora_b_weights.size(1) == 1
  136. lora_b_weights = lora_b_weights.squeeze(dim=1)
  137. else:
  138. assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank)
  139. assert lora_b_weights.is_contiguous()
  140. # TODO tuning this config
  141. N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size
  142. BLOCK_M = 32
  143. BLOCK_N = 32
  144. BLOCK_K = 16
  145. EVEN_K = K % BLOCK_K == 0
  146. ADD_INPUTS = add_inputs
  147. CAST_TYPE = False
  148. if inputs.dtype == torch.float32 and lora_b_weights.dtype in [
  149. torch.float16,
  150. torch.bfloat16,
  151. ]:
  152. CAST_TYPE = True
  153. grid = (
  154. triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N),
  155. batches,
  156. )
  157. _sgmv_expand_kernel[grid](
  158. inputs,
  159. lora_b_weights,
  160. output_tensor,
  161. N,
  162. K,
  163. b_seq_start_loc,
  164. seq_len_tensor,
  165. lora_indices_tensor,
  166. inputs.stride(0),
  167. inputs.stride(1),
  168. lora_b_weights.stride(0),
  169. lora_b_weights.stride(1),
  170. lora_b_weights.stride(2),
  171. output_tensor.stride(0),
  172. output_tensor.stride(1),
  173. BLOCK_M,
  174. BLOCK_N,
  175. BLOCK_K,
  176. EVEN_K,
  177. ADD_INPUTS,
  178. CAST_TYPE,
  179. )
  180. return
  181. try:
  182. sgmv_expand = torch.library.custom_op("lora::sgmv_expand",
  183. _sgmv_expand,
  184. mutates_args=["output_tensor"])
  185. except AttributeError:
  186. sgmv_expand = _sgmv_expand