_custom_ops.py 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730
  1. import contextlib
  2. import functools
  3. from typing import List, Optional, Tuple, Type, Union
  4. import torch
  5. from loguru import logger
  6. try:
  7. import aphrodite._C
  8. except ImportError as e:
  9. logger.warning(f"Failed to import from aphrodite._C with {e}")
  10. with contextlib.suppress(ImportError):
  11. import aphrodite._moe_C
  12. with contextlib.suppress(ImportError):
  13. # ruff: noqa: F401
  14. import aphrodite._punica_C
  15. def is_custom_op_supported(op_name: str) -> bool:
  16. op, overloads = torch._C._jit_get_operation(op_name)
  17. return op is not None
  18. def hint_on_error(fn):
  19. @functools.wraps(fn)
  20. def wrapper(*args, **kwargs):
  21. try:
  22. return fn(*args, **kwargs)
  23. except AttributeError as e:
  24. msg = (
  25. f"Error in calling custom op {fn.__name__}: {e}\n"
  26. f"Possibly you have built or installed an obsolete version of aphrodite.\n"
  27. f"Please try a clean build and install of aphrodite,"
  28. f"or remove old built files such as aphrodite/*.so and build/ ."
  29. )
  30. logger.error(msg)
  31. raise e
  32. return wrapper
  33. # activation ops
  34. def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
  35. torch.ops._C.silu_and_mul(out, x)
  36. def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
  37. torch.ops._C.gelu_and_mul(out, x)
  38. def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
  39. torch.ops._C.gelu_tanh_and_mul(out, x)
  40. def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
  41. torch.ops._C.gelu_fast(out, x)
  42. def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
  43. torch.ops._C.gelu_new(out, x)
  44. def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
  45. torch.ops._C.gelu_quick(out, x)
# paged attention ops
  47. def paged_attention_v1(
  48. out: torch.Tensor,
  49. query: torch.Tensor,
  50. key_cache: torch.Tensor,
  51. value_cache: torch.Tensor,
  52. num_kv_heads: int,
  53. scale: float,
  54. block_tables: torch.Tensor,
  55. seq_lens: torch.Tensor,
  56. block_size: int,
  57. max_seq_len: int,
  58. alibi_slopes: Optional[torch.Tensor],
  59. kv_cache_dtype: str,
  60. k_scale: float,
  61. v_scale: float,
  62. tp_rank: int = 0,
  63. blocksparse_local_blocks: int = 0,
  64. blocksparse_vert_stride: int = 0,
  65. blocksparse_block_size: int = 64,
  66. blocksparse_head_sliding_step: int = 0,
  67. ) -> None:
  68. torch.ops._C.paged_attention_v1(
  69. out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,
  70. seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
  71. k_scale, v_scale, tp_rank, blocksparse_local_blocks,
  72. blocksparse_vert_stride, blocksparse_block_size,
  73. blocksparse_head_sliding_step)
  74. def paged_attention_v2(
  75. out: torch.Tensor,
  76. exp_sum: torch.Tensor,
  77. max_logits: torch.Tensor,
  78. tmp_out: torch.Tensor,
  79. query: torch.Tensor,
  80. key_cache: torch.Tensor,
  81. value_cache: torch.Tensor,
  82. num_kv_heads: int,
  83. scale: float,
  84. block_tables: torch.Tensor,
  85. seq_lens: torch.Tensor,
  86. block_size: int,
  87. max_seq_len: int,
  88. alibi_slopes: Optional[torch.Tensor],
  89. kv_cache_dtype: str,
  90. k_scale: float,
  91. v_scale: float,
  92. tp_rank: int = 0,
  93. blocksparse_local_blocks: int = 0,
  94. blocksparse_vert_stride: int = 0,
  95. blocksparse_block_size: int = 64,
  96. blocksparse_head_sliding_step: int = 0,
  97. ) -> None:
  98. torch.ops._C.paged_attention_v2(
  99. out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache,
  100. num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
  101. alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank,
  102. blocksparse_local_blocks, blocksparse_vert_stride,
  103. blocksparse_block_size, blocksparse_head_sliding_step)
  104. # pos encoding ops
  105. def rotary_embedding(
  106. positions: torch.Tensor,
  107. query: torch.Tensor,
  108. key: torch.Tensor,
  109. head_size: int,
  110. cos_sin_cache: torch.Tensor,
  111. is_neox: bool,
  112. ) -> None:
  113. torch.ops._C.rotary_embedding(positions, query, key, head_size,
  114. cos_sin_cache, is_neox)
  115. def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
  116. key: torch.Tensor, head_size: int,
  117. cos_sin_cache: torch.Tensor, is_neox: bool,
  118. rot_dim: int,
  119. cos_sin_cache_offsets: torch.Tensor) -> None:
  120. torch.ops._C.batched_rotary_embedding(positions, query, key, head_size,
  121. cos_sin_cache, is_neox, rot_dim,
  122. cos_sin_cache_offsets)
  123. # layer norm ops
  124. def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
  125. epsilon: float) -> None:
  126. torch.ops._C.rms_norm(out, input, weight, epsilon)
  127. def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
  128. weight: torch.Tensor, epsilon: float) -> None:
  129. torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
  130. def advance_step(num_seqs: int, num_queries: int, block_size: int,
  131. input_tokens: torch.Tensor, sampled_token_ids: torch.Tensor,
  132. input_positions: torch.Tensor, seq_lens: torch.Tensor,
  133. slot_mapping: torch.Tensor,
  134. block_tables: torch.Tensor) -> None:
  135. """Advance a step on GPU for existing inputs for a multi-step runner"""
  136. return torch.ops._C.advance_step(num_seqs, num_queries, block_size,
  137. input_tokens, sampled_token_ids,
  138. input_positions, seq_lens, slot_mapping,
  139. block_tables)
  140. # quantization ops
  141. # awq
  142. def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor,
  143. zeros: torch.Tensor, split_k_iters: int, thx: int,
  144. thy: int) -> torch.Tensor:
  145. return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters,
  146. thx, thy)
  147. def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
  148. scales: torch.Tensor, split_k_iters: int) -> torch.Tensor:
  149. return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
  150. # gptq
  151. def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
  152. b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor,
  153. b_g_idx: torch.Tensor, use_exllama: bool,
  154. bit: int) -> torch.Tensor:
  155. return torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
  156. b_g_idx, use_exllama, bit)
  157. def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
  158. bit: int) -> None:
  159. torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
  160. # squeezellm
  161. def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor,
  162. lookup_table: torch.Tensor) -> None:
  163. torch.ops._C.squeezellm_gemm(vec, mat, mul, lookup_table)
  164. # marlin
  165. def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
  166. b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int,
  167. size_n: int, size_k: int) -> torch.Tensor:
  168. return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m,
  169. size_n, size_k)
  170. # marlin_24
  171. def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
  172. b_meta: torch.Tensor, b_scales: torch.Tensor,
  173. workspace: torch.Tensor, num_bits: int, size_m: int,
  174. size_n: int, size_k: int) -> torch.Tensor:
  175. return torch.ops._C.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales,
  176. workspace, num_bits, size_m,
  177. size_n, size_k)
  178. # fp8 marlin
  179. def fp8_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
  180. b_scales: torch.Tensor, workspace: torch.Tensor,
  181. num_bits: int, size_m: int, size_n: int,
  182. size_k: int) -> torch.Tensor:
  183. return torch.ops._C.fp8_marlin_gemm(a, b_q_weight, b_scales, workspace,
  184. num_bits, size_m, size_n, size_k)
  185. # cutlass
  186. def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
  187. return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
  188. def cutlass_scaled_mm(a: torch.Tensor,
  189. b: torch.Tensor,
  190. scale_a: torch.Tensor,
  191. scale_b: torch.Tensor,
  192. out_dtype: Type[torch.dtype],
  193. bias: Optional[torch.Tensor] = None) -> torch.Tensor:
  194. assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
  195. assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
  196. m = a.shape[0]
  197. n = b.shape[1]
  198. out = torch.empty((m, n), dtype=out_dtype, device=a.device)
  199. torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
  200. return out
  201. # aqlm
  202. def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
  203. codebooks: torch.Tensor, scales: torch.Tensor,
  204. codebook_partition_sizes: torch.Tensor,
  205. bias: Optional[torch.Tensor]) -> torch.Tensor:
  206. return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales,
  207. codebook_partition_sizes, bias)
  208. def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor,
  209. codebook_partition_sizes: torch.Tensor) -> torch.Tensor:
  210. return torch.ops._C.aqlm_dequant(codes, codebooks,
  211. codebook_partition_sizes)
  212. # gptq_marlin
  213. def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
  214. size_k: int, size_n: int,
  215. num_bits: int) -> torch.Tensor:
  216. return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n,
  217. num_bits)
  218. def awq_marlin_repack(b_q_weight: torch.Tensor, size_k: int, size_n: int,
  219. num_bits: int) -> torch.Tensor:
  220. return torch.ops._C.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits)
  221. def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
  222. b_scales: torch.Tensor, b_zeros: torch.Tensor,
  223. g_idx: torch.Tensor, perm: torch.Tensor,
  224. workspace: torch.Tensor, num_bits: int, size_m: int,
  225. size_n: int, size_k: int, is_k_full: bool,
  226. has_zp: bool) -> torch.Tensor:
  227. return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros,
  228. g_idx, perm, workspace, num_bits,
  229. size_m, size_n, size_k, is_k_full,
  230. has_zp)
  231. # fp8
  232. def scaled_fp8_quant(
  233. input: torch.Tensor,
  234. scale: Optional[torch.Tensor] = None,
  235. batch_dim_padding: Optional[int] = None,
  236. scale_ub: Optional[torch.Tensor] = None,
  237. use_per_token_if_dynamic: bool = False,
  238. ) -> Tuple[torch.Tensor, torch.Tensor]:
  239. """
  240. Quantize input tensor to FP8 and return quantized tensor and scale.
  241. This function supports both static and dynamic quantization: If you
  242. provide the scale, it will use static scaling and if you omit it,
  243. the scale will be determined dynamically. The function also allows
  244. optional padding of the output tensor for downstream kernels that
  245. will benefit from padding.
  246. Args:
  247. input: The input tensor to be quantized to FP8
  248. scale: Optional scaling factor for the FP8 quantization
  249. batch_dim_padding: If specified, pad the first dimension
  250. of the output to at least this value.
  251. use_per_token_if_dynamic: Whether to do per_tensor or per_token
  252. in the dynamic quantization case.
  253. Returns:
  254. Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
  255. scaling factor.
  256. """
  257. if batch_dim_padding:
  258. shape = (max(batch_dim_padding, input.shape[0]), *input.shape[1:])
  259. output = torch.empty(shape,
  260. device=input.device,
  261. dtype=torch.float8_e4m3fn)
  262. else:
  263. output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
  264. if scale is None:
  265. if use_per_token_if_dynamic:
  266. scale = torch.empty((input.numel() // input.shape[-1], 1),
  267. device=input.device,
  268. dtype=torch.float32)
  269. torch.ops._C.dynamic_per_token_scaled_fp8_quant(
  270. output, input, scale, scale_ub)
  271. else:
  272. scale = torch.zeros(1, device=input.device, dtype=torch.float32)
  273. torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
  274. else:
  275. torch.ops._C.static_scaled_fp8_quant(output, input, scale)
  276. return output, scale
  277. # int8
  278. def scaled_int8_quant(
  279. input: torch.Tensor,
  280. scale: Optional[torch.Tensor] = None
  281. ) -> Tuple[torch.Tensor, torch.Tensor]:
  282. """
  283. Quantize the input tensor to int8 and return the quantized tensor and scale.
  284. Args:
  285. input: The input tensor to be quantized to int8.
  286. scale: Optional scaling factor for the int8 quantization.
  287. When not provided, we invoke dynamic-per-token quantization.
  288. Returns:
  289. Tuple[Torch.Tensor, Torch.Tensor] : Output int8 tensor and scales.
  290. """
  291. output = torch.empty_like(input, dtype=torch.int8)
  292. if scale is not None:
  293. # static-per-tensor quantization.
  294. torch.ops._C.static_scaled_int8_quant(output, input, scale)
  295. return output, scale
  296. # dynamic-per-token quantization.
  297. input_scales = torch.empty((input.numel() // input.shape[-1], 1),
  298. device=input.device,
  299. dtype=torch.float32)
  300. torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales)
  301. return output, input_scales
  302. # quip#
  303. def quip_gemv(
  304. A: torch.Tensor,
  305. B: torch.Tensor,
  306. CB: torch.Tensor,
  307. ) -> torch.Tensor:
  308. return torch.ops._C.quip_gemv(A, B, CB)
  309. def quip_decompress(
  310. YIs: torch.Tensor,
  311. CB: torch.Tensor,
  312. Y: torch.Tensor,
  313. ) -> torch.Tensor:
  314. return torch.ops._C.quip_decompress(YIs, CB, Y)
  315. # mamba
  316. def causal_conv1d_fwd(x: torch.Tensor, weight: torch.Tensor,
  317. bias_: Optional[torch.Tensor],
  318. seq_idx_: Optional[torch.Tensor],
  319. initial_states_: Optional[torch.Tensor],
  320. final_states_out_: Optional[torch.Tensor],
  321. silu_activation: bool) -> torch.Tensor:
  322. return torch.ops._C.causal_conv1d_fwd(x, weight, bias_, seq_idx_, None,
  323. initial_states_, final_states_out_,
  324. silu_activation)
  325. def causal_conv1d_update(x: torch.Tensor, conv_state: torch.Tensor,
  326. weight: torch.Tensor, bias_: Optional[torch.Tensor],
  327. silu_activation: bool) -> torch.Tensor:
  328. return torch.ops._C.causal_conv1d_update(x, conv_state, weight, bias_,
  329. silu_activation)
  330. def selective_scan_fwd(u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor,
  331. B: torch.Tensor, C: torch.Tensor,
  332. D_: Optional[torch.Tensor], z_: Optional[torch.Tensor],
  333. delta_bias_: Optional[torch.Tensor],
  334. delta_softplus: bool, index_: Optional[torch.Tensor],
  335. x: Optional[torch.Tensor]) -> List[torch.Tensor]:
  336. return torch.ops._C.selective_scan_fwd(u, delta, A, B, C, D_, z_,
  337. delta_bias_, delta_softplus, index_,
  338. x)
  339. # moe
  340. def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
  341. block_size: int, sorted_token_ids: torch.Tensor,
  342. experts_ids: torch.Tensor,
  343. num_tokens_post_pad: torch.Tensor) -> None:
  344. torch.ops._C.moe_align_block_size(topk_ids, num_experts, block_size,
  345. sorted_token_ids, experts_ids,
  346. num_tokens_post_pad)
  347. def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
  348. token_expert_indicies: torch.Tensor,
  349. gating_output: float) -> None:
  350. torch.ops._moe_C.topk_softmax(topk_weights, topk_ids,
  351. token_expert_indicies, gating_output)
  352. def reshape_and_cache(
  353. key: torch.Tensor,
  354. value: torch.Tensor,
  355. key_cache: torch.Tensor,
  356. value_cache: torch.Tensor,
  357. slot_mapping: torch.Tensor,
  358. kv_cache_dtype: str,
  359. k_scale: float,
  360. v_scale: float,
  361. ) -> None:
  362. torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache,
  363. value_cache, slot_mapping,
  364. kv_cache_dtype, k_scale, v_scale)
  365. def reshape_and_cache_flash(
  366. key: torch.Tensor,
  367. value: torch.Tensor,
  368. key_cache: torch.Tensor,
  369. value_cache: torch.Tensor,
  370. slot_mapping: torch.Tensor,
  371. kv_cache_dtype: str,
  372. ) -> None:
  373. torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache,
  374. value_cache, slot_mapping,
  375. kv_cache_dtype)
  376. def copy_blocks(key_caches: List[torch.Tensor],
  377. value_caches: List[torch.Tensor],
  378. block_mapping: torch.Tensor) -> None:
  379. torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
  380. def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
  381. block_mapping: torch.Tensor) -> None:
  382. torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping)
  383. def convert_fp8(output: torch.Tensor,
  384. input: torch.Tensor,
  385. scale: float = 1.0,
  386. kv_dtype: str = "fp8") -> None:
  387. torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype)
  388. def get_device_attribute(attribute: int, device: int) -> int:
  389. return torch.ops._C_cuda_utils.get_device_attribute(attribute, device)
  390. def get_max_shared_memory_per_block_device_attribute(device: int) -> int:
  391. # ruff: noqa: E501
  392. return torch.ops._C_cuda_utils.get_max_shared_memory_per_block_device_attribute(
  393. device)
  394. # custom ar
  395. def init_custom_ar(meta: torch.Tensor, rank_data: torch.Tensor,
  396. handles: List[str], offsets: List[int], rank: int,
  397. full_nvlink: bool) -> int:
  398. return torch.ops._C_custom_ar.init_custom_ar(meta, rank_data, handles,
  399. offsets, rank, full_nvlink)
  400. def should_custom_ar(inp: torch.Tensor, max_size: int, world_size: int,
  401. full_nvlink: bool) -> bool:
  402. return torch.ops._C_custom_ar.should_custom_ar(inp, max_size, world_size,
  403. full_nvlink)
  404. def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
  405. torch.ops._C_custom_ar.all_reduce_reg(fa, inp, out)
  406. def all_reduce_unreg(fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor,
  407. out: torch.Tensor) -> None:
  408. torch.ops._C_custom_ar.all_reduce_unreg(fa, inp, reg_buffer, out)
  409. def dispose(fa: int) -> None:
  410. torch.ops._C_custom_ar.dispose(fa)
  411. def meta_size() -> int:
  412. return torch.ops._C_custom_ar.meta_size()
  413. def register_buffer(fa: int, t: torch.Tensor, handles: List[str],
  414. offsets: List[int]) -> None:
  415. return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets)
  416. def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]:
  417. return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
  418. def register_graph_buffers(fa: int, handles: List[str],
  419. offsets: List[List[int]]) -> None:
  420. torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
  421. # punica
  422. def dispatch_bgmv(
  423. y: torch.Tensor,
  424. x: torch.Tensor,
  425. w_t_all: torch.Tensor,
  426. indicies: torch.Tensor,
  427. layer_idx: int,
  428. scale: float,
  429. ) -> None:
  430. torch.ops._punica_C.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx,
  431. scale)
  432. def dispatch_bgmv_low_level(
  433. y: torch.Tensor,
  434. x: torch.Tensor,
  435. w_t_all: torch.Tensor,
  436. indicies: torch.Tensor,
  437. layer_idx: int,
  438. scale: float,
  439. h_in: int,
  440. h_out: int,
  441. y_offset: int,
  442. ) -> None:
  443. torch.ops._punica_C.dispatch_bgmv_low_level(
  444. y,
  445. x,
  446. w_t_all,
  447. indicies,
  448. layer_idx,
  449. scale,
  450. h_in,
  451. h_out,
  452. y_offset,
  453. )
  454. # Sampling Kernels
  455. def sampling_from_probs(probs: torch.Tensor,
  456. uniform_samplers: torch.Tensor,
  457. deterministic: bool = True,
  458. check_nan: bool = False) -> torch.Tensor:
  459. if check_nan and torch.any(torch.isnan(probs)):
  460. raise ValueError("NaN detected in probs")
  461. return torch.ops._C.sampling_from_probs(probs, uniform_samplers,
  462. deterministic)
  463. def _to_tensor_scalar_tuple(x):
  464. if isinstance(x, torch.Tensor):
  465. return (x, 0)
  466. else:
  467. return (None, x)
  468. def top_p_sampling_from_probs(
  469. probs: torch.Tensor,
  470. uniform_samples: torch.Tensor,
  471. top_p: Union[torch.Tensor, float],
  472. deterministic: bool = True,
  473. check_nan: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
  474. if check_nan and torch.any(torch.isnan(probs)):
  475. raise ValueError("NaN detected in probs")
  476. return torch.ops._C.top_p_sampling_from_probs(
  477. probs, uniform_samples, *_to_tensor_scalar_tuple(top_p), deterministic)
  478. def top_k_sampling_from_probs(
  479. probs: torch.Tensor,
  480. uniform_samples: torch.Tensor,
  481. top_k: Union[torch.Tensor, int],
  482. deterministic: bool = True,
  483. check_nan: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
  484. if check_nan and torch.any(torch.isnan(probs)):
  485. raise ValueError("NaN detected in probs")
  486. return torch.ops._C.top_k_sampling_from_probs(
  487. probs, uniform_samples, *_to_tensor_scalar_tuple(top_k), deterministic)
  488. def min_p_sampling_from_probs(
  489. probs: torch.Tensor,
  490. uniform_samples: torch.Tensor,
  491. min_p: Union[torch.Tensor, float],
  492. deterministic: bool = True,
  493. check_nan: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
  494. if check_nan and torch.any(torch.isnan(probs)):
  495. raise ValueError("NaN detected in probs")
  496. return torch.ops._C.min_p_sampling_from_probs(
  497. probs, uniform_samples, *_to_tensor_scalar_tuple(min_p), deterministic)
  498. def top_k_mask_logits(
  499. logits: torch.Tensor,
  500. top_k: Union[torch.Tensor, int],
  501. ) -> torch.Tensor:
  502. return torch.ops._C.top_k_mask_logits(logits,
  503. *_to_tensor_scalar_tuple(top_k))
  504. def top_p_renorm_prob(
  505. probs: torch.Tensor,
  506. top_p: Union[torch.Tensor, float],
  507. ) -> torch.Tensor:
  508. return torch.ops._C.top_p_renorm_prob(probs,
  509. *_to_tensor_scalar_tuple(top_p))
  510. def top_k_renorm_prob(
  511. probs: torch.Tensor,
  512. top_k: Union[torch.Tensor, int],
  513. ) -> torch.Tensor:
  514. return torch.ops._C.top_k_renorm_prob(probs,
  515. *_to_tensor_scalar_tuple(top_k))
  516. def top_k_top_p_sampling_from_logits(
  517. probs: torch.Tensor,
  518. uniform_samples: torch.Tensor,
  519. top_k: Union[torch.Tensor, int],
  520. top_p: Union[torch.Tensor, float],
  521. filter_apply_order: str = "top_k_first",
  522. deterministic: bool = True,
  523. check_nan: bool = False,
  524. ) -> Tuple[torch.Tensor, torch.Tensor]:
  525. if filter_apply_order == "top_k_first":
  526. masked_logits = top_k_mask_logits(probs, top_k)
  527. probs = torch.softmax(masked_logits, dim=-1)
  528. return top_p_sampling_from_probs(probs, uniform_samples, top_p,
  529. deterministic, check_nan)
  530. elif filter_apply_order == "joint":
  531. probs = torch.softmax(probs, dim=-1)
  532. if check_nan and torch.any(torch.isnan(probs)):
  533. raise ValueError("NaN detected in probs")
  534. return torch.ops._C.top_k_top_p_sampling_from_logits(
  535. probs, uniform_samples, *_to_tensor_scalar_tuple(top_k),
  536. *_to_tensor_scalar_tuple(top_p), deterministic)
  537. else:
  538. raise ValueError(f"Invalid filter_apply_order: {filter_apply_order}")
  539. def top_k_top_p_sampling_from_probs(
  540. probs: torch.Tensor,
  541. uniform_samples: torch.Tensor,
  542. top_k: Union[torch.Tensor, int],
  543. top_p: Union[torch.Tensor, float],
  544. filter_apply_order: str = "top_k_first",
  545. deterministic: bool = True,
  546. check_nan: bool = False,
  547. ) -> Tuple[torch.Tensor, torch.Tensor]:
  548. if filter_apply_order == "top_k_first":
  549. renorm_probs = top_k_renorm_prob(probs, top_k)
  550. return top_p_sampling_from_probs(renorm_probs, uniform_samples, top_p,
  551. deterministic, check_nan)
  552. elif filter_apply_order == "joint":
  553. if check_nan and torch.any(torch.isnan(probs)):
  554. raise ValueError("NaN detected in probs")
  555. return torch.ops._C.top_k_top_p_sampling_from_probs(
  556. probs, uniform_samples, *_to_tensor_scalar_tuple(top_k),
  557. *_to_tensor_scalar_tuple(top_p), deterministic)
  558. else:
  559. raise ValueError(f"Invalid filter_apply_order: {filter_apply_order}")
# TODO: remove this later
# Module-level pass: every plain function defined in this file that has
# `torch.Tensor` among its annotations is rebound in the module namespace
# to its `hint_on_error`-wrapped version, so an AttributeError raised
# inside it is logged with a rebuild hint before propagating.
names_and_values = globals()
names_and_values_to_update = {}
# prepare variables to avoid dict size change during iteration
# (the loop below iterates over globals() itself, so any name first bound
# inside the loop would add a key to the dict being iterated)
k, v, arg = None, None, None
fn_type = type(lambda x: x)
for k, v in names_and_values.items():
    # find functions that are defined in this file and have torch.Tensor
    # in their annotations. `arg == "torch.Tensor"` is used to handle
    # the case when users use `from __future__ import annotations` to turn
    # type hints into strings.
    if isinstance(v, fn_type) \
            and v.__code__.co_filename == __file__ \
            and any(arg is torch.Tensor or arg == "torch.Tensor"
                    for arg in v.__annotations__.values()):
        # Collected separately and applied after the loop, so globals() is
        # not modified while it is being iterated.
        names_and_values_to_update[k] = hint_on_error(v)
names_and_values.update(names_and_values_to_update)
# Remove the temporaries from the module namespace. NOTE: `arg` is not in
# this list, so it stays behind as a module global bound to None.
del names_and_values_to_update, names_and_values, v, k, fn_type