
feat: add cuda sampling kernels for top_k and top_p

AlpinDale 4 months ago
Parent
Current commit
22422d962b

+ 2 - 1
CMakeLists.txt

@@ -218,7 +218,8 @@ if(APHRODITE_GPU_LANG STREQUAL "CUDA")
     "kernels/quantization/gguf/gguf_kernel.cu"
     "kernels/quantization/gguf/gguf_kernel.cu"
     "kernels/quantization/gptq_marlin/awq_marlin_repack.cu"
     "kernels/quantization/gptq_marlin/awq_marlin_repack.cu"
     "kernels/quantization/fp8/fp8_marlin.cu"
     "kernels/quantization/fp8/fp8_marlin.cu"
-    "kernels/all_reduce/custom_all_reduce.cu")
+    "kernels/all_reduce/custom_all_reduce.cu"
+    "kernels/sampling/sampling.cu")
 
 
   # Add CUTLASS and GPTQ Marlin kernels if not MSVC
   if(NOT MSVC)

+ 117 - 1
aphrodite/_custom_ops.py

@@ -1,6 +1,6 @@
 import contextlib
 import functools
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union
 
 
 import torch
 from loguru import logger
@@ -632,6 +632,122 @@ def register_graph_buffers(fa: int, handles: List[str],
     torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
 
 
 
 
+# Sampling Kernels
+def sampling_from_probs(probs: torch.Tensor,
+                        uniform_samples: torch.Tensor,
+                        deterministic: bool = True,
+                        check_nan: bool = False) -> torch.Tensor:
+    if check_nan and torch.any(torch.isnan(probs)):
+        raise ValueError("NaN detected in probs")
+    return torch.ops._C.sampling_from_probs(probs, uniform_samples,
+                                            deterministic)
+
+def _to_tensor_scalar_tuple(x):
+    if isinstance(x, torch.Tensor):
+        return (x, 0)
+    else:
+        return (None, x)
+def top_p_sampling_from_probs(
+        probs: torch.Tensor,
+        uniform_samples: torch.Tensor,
+        top_p: Union[torch.Tensor, float],
+        deterministic: bool = True,
+        check_nan: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
+    if check_nan and torch.any(torch.isnan(probs)):
+        raise ValueError("NaN detected in probs")
+    return torch.ops._C.top_p_sampling_from_probs(
+        probs, uniform_samples, *_to_tensor_scalar_tuple(top_p), deterministic)
+
+def top_k_sampling_from_probs(
+        probs: torch.Tensor,
+        uniform_samples: torch.Tensor,
+        top_k: Union[torch.Tensor, int],
+        deterministic: bool = True,
+        check_nan: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
+    if check_nan and torch.any(torch.isnan(probs)):
+        raise ValueError("NaN detected in probs")
+    return torch.ops._C.top_k_sampling_from_probs(
+        probs, uniform_samples, *_to_tensor_scalar_tuple(top_k), deterministic)
+
+def min_p_sampling_from_probs(
+        probs: torch.Tensor,
+        uniform_samples: torch.Tensor,
+        min_p: Union[torch.Tensor, float],
+        deterministic: bool = True,
+        check_nan: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
+    if check_nan and torch.any(torch.isnan(probs)):
+        raise ValueError("NaN detected in probs")
+    return torch.ops._C.min_p_sampling_from_probs(
+        probs, uniform_samples, *_to_tensor_scalar_tuple(min_p), deterministic)
+
+def top_k_mask_logits(
+    logits: torch.Tensor,
+    top_k: Union[torch.Tensor, int],
+) -> torch.Tensor:
+    return torch.ops._C.top_k_mask_logits(logits,
+                                          *_to_tensor_scalar_tuple(top_k))
+
+def top_p_renorm_prob(
+    probs: torch.Tensor,
+    top_p: Union[torch.Tensor, float],
+) -> torch.Tensor:
+    return torch.ops._C.top_p_renorm_prob(probs,
+                                          *_to_tensor_scalar_tuple(top_p))
+
+def top_k_renorm_prob(
+    probs: torch.Tensor,
+    top_k: Union[torch.Tensor, int],
+) -> torch.Tensor:
+    return torch.ops._C.top_k_renorm_prob(probs,
+                                          *_to_tensor_scalar_tuple(top_k))
+
+def top_k_top_p_sampling_from_logits(
+    logits: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    top_k: Union[torch.Tensor, int],
+    top_p: Union[torch.Tensor, float],
+    filter_apply_order: str = "top_k_first",
+    deterministic: bool = True,
+    check_nan: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if filter_apply_order == "top_k_first":
+        masked_logits = top_k_mask_logits(logits, top_k)
+        probs = torch.softmax(masked_logits, dim=-1)
+        return top_p_sampling_from_probs(probs, uniform_samples, top_p,
+                                         deterministic, check_nan)
+    elif filter_apply_order == "joint":
+        probs = torch.softmax(logits, dim=-1)
+        if check_nan and torch.any(torch.isnan(probs)):
+            raise ValueError("NaN detected in probs")
+        return torch.ops._C.top_k_top_p_sampling_from_probs(
+            probs, uniform_samples, *_to_tensor_scalar_tuple(top_k),
+            *_to_tensor_scalar_tuple(top_p), deterministic)
+    else:
+        raise ValueError(f"Invalid filter_apply_order: {filter_apply_order}")
+
+def top_k_top_p_sampling_from_probs(
+    probs: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    top_k: Union[torch.Tensor, int],
+    top_p: Union[torch.Tensor, float],
+    filter_apply_order: str = "top_k_first",
+    deterministic: bool = True,
+    check_nan: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if filter_apply_order == "top_k_first":
+        renorm_probs = top_k_renorm_prob(probs, top_k)
+        return top_p_sampling_from_probs(renorm_probs, uniform_samples, top_p,
+                                         deterministic, check_nan)
+    elif filter_apply_order == "joint":
+        if check_nan and torch.any(torch.isnan(probs)):
+            raise ValueError("NaN detected in probs")
+        return torch.ops._C.top_k_top_p_sampling_from_probs(
+            probs, uniform_samples, *_to_tensor_scalar_tuple(top_k),
+            *_to_tensor_scalar_tuple(top_p), deterministic)
+    else:
+        raise ValueError(f"Invalid filter_apply_order: {filter_apply_order}")
+
+
 # TODO: remove this later
 names_and_values = globals()
 names_and_values_to_update = {}
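
The wrappers above are thin bindings over torch.ops._C. A rough usage sketch, not taken from this commit and assuming the extension was built with kernels/sampling/sampling.cu: uniform_samples carries one row of noise per rejection-sampling round, and each sampling call returns a (samples, success) pair.

# Hypothetical usage sketch; shapes follow the CHECK_DIM assertions in
# kernels/sampling/sampling.cu.
import torch

import aphrodite._custom_ops as ops

batch_size, vocab_size, max_rounds = 4, 32000, 32
logits = torch.randn(batch_size, vocab_size, device="cuda")
probs = torch.softmax(logits, dim=-1)
# One row of uniform noise per rejection-sampling round.
uniform_samples = torch.rand(max_rounds, batch_size, device="cuda")

samples, success = ops.top_k_top_p_sampling_from_probs(
    probs, uniform_samples, top_k=20, top_p=0.9)
if not success.all():
    # Mirrors the fallback used in sampler.py: renormalize, then sample once.
    probs = ops.top_k_renorm_prob(probs, 20)
    probs = ops.top_p_renorm_prob(probs, 0.9)
    samples = ops.sampling_from_probs(probs, uniform_samples[0])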

+ 76 - 22
aphrodite/modeling/layers/sampler.py

@@ -1,11 +1,14 @@
 """A layer that samples the next tokens from the model's outputs."""
 """A layer that samples the next tokens from the model's outputs."""
 import itertools
 import itertools
+import os
+import warnings
 from math import inf
 from typing import Dict, List, Optional, Tuple
 
 
 import torch
 import torch.nn as nn
 
 
+import aphrodite._custom_ops as ops
 from aphrodite.common.sampling_params import SamplingType
 from aphrodite.common.sequence import (CompletionSequenceGroupOutput, Logprob,
                                        PromptLogprobs, SampleLogprobs,
@@ -27,6 +30,11 @@ SampleResultType = List[Tuple[List[int], List[int]]]
 # that this temperature well-uses the fp16 space after the logits are offset.
 _TEMPERATURE_MINIMUM = 2e-5
 
 
+# If enabled, we switch to a more performant implementation
+# of top-k and top-p
+APHRODITE_USE_SAMPLING_KERNELS = bool(int(
+    os.getenv("APHRODITE_USE_SAMPLING_KERNELS", "0")))
+
 
 
 class Sampler(nn.Module):
     """Samples the next tokens from the model's outputs.
@@ -155,7 +163,7 @@ class Sampler(nn.Module):
         if do_nsigmas:
             logits = _apply_top_nsigma(logits, sampling_tensors.nsigmas)
 
 
-        if do_top_p_top_k:
+        if do_top_p_top_k and not APHRODITE_USE_SAMPLING_KERNELS:
             logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps,
                                         sampling_tensors.top_ks)
 
 
@@ -816,14 +824,7 @@ def _multinomial(
     seq_groups: Optional[List[SequenceGroupToSample]] = None,
 ) -> torch.Tensor:
     if num_samples > 1:
-        # This is equivalent to torch.repeat_interleaved (which also
-        # forces a GPU<->CPU sync).
-        # This allows us to do sampling with replacement by creating
-        # num_samples copies of each row in the tensor, and then
-        # batch sampling the resulting tensor.
-        probs = probs[:, None, :].expand(probs.shape[0], num_samples,
-                                         probs.shape[1]).contiguous().view(
-                                             -1, probs.shape[1])
+        probs = probs.repeat_interleave(num_samples, dim=0)
     q = torch.empty_like(probs)
     if seq_groups is None:
         q.exponential_()
@@ -831,17 +832,57 @@ def _multinomial(
         sample_idx = 0
         for seq_group in seq_groups:
             seq_ids = seq_group.seq_ids
-            next_sample_idx = sample_idx + len(seq_ids) * num_samples
-            q[sample_idx:next_sample_idx].exponential_(
-                generator=seq_group.generator)
-            sample_idx = next_sample_idx
+            stride = len(seq_ids) * num_samples
+            assert seq_group.generator is not None
+            q[sample_idx:sample_idx +
+              stride].exponential_(generator=seq_group.generator)
+            sample_idx += stride
     return probs.div_(q).argmax(dim=1).view(-1, num_samples)
 
 
 
 
+def _top_k_top_p_multinomial_with_kernels(
+        probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor,
+        num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]]):
+    max_top_k_round = 32
+    if num_samples > 1:
+        probs = probs.repeat_interleave(num_samples, dim=0)
+        top_ks = top_ks.repeat_interleave(num_samples)
+        top_ps = top_ps.repeat_interleave(num_samples)
+    batch_size = probs.shape[0]
+    uniform_samples = torch.empty((max_top_k_round, batch_size),
+                                  device=probs.device)
+    if seq_groups is None:
+        uniform_samples.uniform_()
+    else:
+        sample_idx = 0
+        for seq_group in seq_groups:
+            seq_ids = seq_group.seq_ids
+            stride = len(seq_ids) * num_samples
+            assert seq_group.generator is not None
+            uniform_samples[:, sample_idx:sample_idx +
+                            stride].uniform_(generator=seq_group.generator)
+            sample_idx += stride
+    batch_next_token_ids, success = ops.top_k_top_p_sampling_from_probs(
+        probs,
+        uniform_samples,
+        top_ks,
+        top_ps,
+    )
+    if not success.all():
+        warnings.warn("CUDA rejection sampling failed, fallback.",
+                      stacklevel=1)
+        probs = ops.top_k_renorm_prob(probs, top_ks)
+        probs = ops.top_p_renorm_prob(probs, top_ps)
+        batch_next_token_ids = ops.sampling_from_probs(
+            probs, uniform_samples[0])
+    return batch_next_token_ids.view(-1, num_samples)
+
+
 def _sample_with_torch(
     probs: torch.Tensor,
     logprobs: torch.Tensor,
     sampling_metadata: SamplingMetadata,
+    sampling_tensors: SamplingTensors,
     include_gpu_probs_tensor: bool,
     modify_greedy_probs: bool,
 ) -> Tuple[List[Tuple[List[int], List[int]]], Optional[torch.Tensor]]:
@@ -897,17 +938,29 @@ def _sample_with_torch(
                     sampling_params = seq_group.sampling_params
                     max_best_of_in_batch = max(max_best_of_in_batch,
                                                sampling_params.best_of)
-            seeded_args = {} if sampling_type == SamplingType.RANDOM else {
-                "seq_groups": seq_groups,
-            }
 
 
-            multinomial_samples[sampling_type] = _multinomial(
-                probs[long_sample_indices], max_best_of_in_batch,
-                **seeded_args)
-            if include_gpu_probs_tensor:
+            seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else
+                              seq_groups)
+            if APHRODITE_USE_SAMPLING_KERNELS:
+                multinomial_samples[
+                    sampling_type] = _top_k_top_p_multinomial_with_kernels(
+                        probs[long_sample_indices],
+                        sampling_tensors.top_ks[long_sample_indices],
+                        sampling_tensors.top_ps[long_sample_indices],
+                        max_best_of_in_batch,
+                        seq_groups_arg,
+                    )
+            else:
+                multinomial_samples[sampling_type] = _multinomial(
+                    probs[long_sample_indices],
+                    max_best_of_in_batch,
+                    seq_groups=seq_groups_arg)
+
+            if sampled_token_ids_tensor is not None:
                 # Store sampled tokens in output tensor.
-                sampled_token_ids_tensor[
-                    long_sample_indices] = multinomial_samples[sampling_type]
+                sampled_token_ids_tensor[long_sample_indices] = \
+                    multinomial_samples[sampling_type].to(torch.long)
+
         elif sampling_type == SamplingType.BEAM:
             beam_search_logprobs = logprobs[sample_indices]
         else:
@@ -1035,6 +1088,7 @@ def _sample(
         probs,
         logprobs,
         sampling_metadata,
+        sampling_tensors,
         include_gpu_probs_tensor=include_gpu_probs_tensor,
         modify_greedy_probs=modify_greedy_probs,
     )
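
The sampler changes are gated at module import time by the APHRODITE_USE_SAMPLING_KERNELS environment variable. A minimal opt-in sketch, assuming a CUDA build that compiled the new kernels:

# Sketch only: the flag is read when the sampler module is imported, so it
# must be set beforehand.
import os

os.environ["APHRODITE_USE_SAMPLING_KERNELS"] = "1"

from aphrodite.modeling.layers import sampler

assert sampler.APHRODITE_USE_SAMPLING_KERNELS
# With the flag set, _apply_top_k_top_p is skipped during logits processing
# and top-k/top-p is instead applied inside
# _top_k_top_p_multinomial_with_kernels when random sampling runs.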

+ 32 - 0
kernels/ops.h

@@ -102,4 +102,36 @@ at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
                              const c10::optional<at::Tensor>& initial_states_,
                              const c10::optional<at::Tensor>& final_states_out_,
                              bool silu_activation);
+
+// Sampling kernels
+torch::Tensor sampling_from_probs(torch::Tensor probs,
+                                  torch::Tensor uniform_samples,
+                                  bool deterministic);
+std::vector<torch::Tensor> top_p_sampling_from_probs(
+    torch::Tensor probs, torch::Tensor uniform_samples,
+    std::optional<torch::Tensor> maybe_top_p_arr, double top_p_val,
+    bool deterministic);
+std::vector<torch::Tensor> top_k_sampling_from_probs(
+    torch::Tensor probs, torch::Tensor uniform_samples,
+    std::optional<torch::Tensor> maybe_top_k_arr, int64_t top_k_val,
+    bool deterministic);
+std::vector<torch::Tensor> min_p_sampling_from_probs(
+    torch::Tensor probs, torch::Tensor uniform_samples,
+    std::optional<torch::Tensor> maybe_min_p_arr, double min_p_val,
+    bool deterministic);
+std::vector<torch::Tensor> top_k_top_p_sampling_from_probs(
+    torch::Tensor probs, torch::Tensor uniform_samples,
+    std::optional<torch::Tensor> maybe_top_k_arr, double top_k_val,
+    std::optional<torch::Tensor> maybe_top_p_arr, double top_p_val,
+    bool deterministic);
+torch::Tensor top_p_renorm_prob(torch::Tensor probs,
+                                std::optional<torch::Tensor> maybe_top_p_arr,
+                                double top_p_val);
+torch::Tensor top_k_renorm_prob(torch::Tensor probs,
+                                std::optional<torch::Tensor> maybe_top_k_arr,
+                                int64_t top_k_val);
+torch::Tensor top_k_mask_logits(torch::Tensor logits,
+                                std::optional<torch::Tensor> maybe_top_k_arr,
+                                int64_t top_k_val);
+
 #endif
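
Each declaration pairs an optional per-request tensor (maybe_*_arr) with a scalar default (*_val); on the Python side, _to_tensor_scalar_tuple expands a single argument into that pair. A hedged sketch of both calling styles through the Python wrapper:

# Sketch of the tensor-or-scalar convention behind (maybe_top_k_arr, top_k_val).
import torch

import aphrodite._custom_ops as ops

probs = torch.softmax(torch.randn(2, 32000, device="cuda"), dim=-1)

# Scalar path: maybe_top_k_arr is absent and top_k_val applies to every row.
renormed = ops.top_k_renorm_prob(probs, 40)

# Tensor path: a per-request top_k tensor is passed and top_k_val (0) is unused.
per_request_k = torch.tensor([20, 100], dtype=torch.int32, device="cuda")
renormed = ops.top_k_renorm_prob(probs, per_request_k)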

+ 159 - 0
kernels/sampling/math.cuh

@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2024 by PygmalionAI team.
+ * Copyright (c) 2023 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef APHRODITE_MATH_CUH_
+#define APHRODITE_MATH_CUH_
+
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+namespace aphrodite {
+namespace math {
+
+// log2(e)
+constexpr float log2e = 1.44269504088896340736f;
+
+__forceinline__ __device__ half2 uint32_as_half2(uint32_t x) {
+  return *(half2*)&x;
+}
+
+__forceinline__ __device__ uint32_t half2_as_uint32(half2 x) {
+  return *(uint32_t*)&x;
+}
+
+/*!
+ * \brief Wrapper of PTX ex2.approx instruction, which computes 2^x
+ * \param x input
+ */
+__forceinline__ __device__ float ptx_exp2(float x) {
+  float y;
+  asm volatile("ex2.approx.ftz.f32 %0, %1;" : "=f"(y) : "f"(x));
+  return y;
+}
+
+/*!
+ * \brief Wrapper of PTX lg2.approx instruction, which computes log2(x)
+ * \param x input
+ */
+__forceinline__ __device__ float ptx_log2(float x) {
+  float y;
+  asm volatile("lg2.approx.ftz.f32 %0, %1;" : "=f"(y) : "f"(x));
+  return y;
+}
+
+/*!
+ * \brief Wrapper of PTX ex2.approx.f16x2 instruction, which computes 2^x
+ * \param x input
+ */
+__forceinline__ __device__ half2 ptx_exp2(half2 x) {
+  uint32_t y_u32;
+  uint32_t x_u32 = half2_as_uint32(x);
+  asm volatile("ex2.approx.f16x2 %0, %1;" : "=r"(y_u32) : "r"(x_u32));
+  return uint32_as_half2(y_u32);
+}
+
+/*!
+ * \brief Wrapper of PTX ex2.approx.f16 instruction, which computes 2^x
+ * \param x input
+ */
+__forceinline__ __device__ half ptx_exp2(half x) {
+  ushort y_u16;
+  asm volatile("ex2.approx.f16 %0, %1;"
+               : "=h"(y_u16)
+               : "h"(__half_as_ushort(x)));
+  return __ushort_as_half(y_u16);
+}
+
+/*!
+ * \brief Wrapper of PTX rcp.approx instruction, which computes 1/x
+ * \param x input
+ */
+__forceinline__ __device__ float ptx_rcp(float x) {
+  float y;
+  asm volatile("rcp.approx.ftz.f32 %0, %1;" : "=f"(y) : "f"(x));
+  return y;
+}
+
+/*!
+ * \brief Wrapper of PTX shfl.sync.bfly instruction, which performs a butterfly
+ * shuffle between threads in a warp.
+ * \param x The value in the source lane
+ * \param lane_mask The mask to xor with the thread index: y[i] <- x[i ^ lane_mask]
+ */
+__forceinline__ __device__ float shfl_xor_sync(float x, int lane_mask) {
+  float y;
+  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, 0x1f, 0xffffffff;"
+               : "=f"(y)
+               : "f"(x), "r"(lane_mask));
+  return y;
+}
+
+/*!
+ * \brief Wrapper of PTX shfl.sync.bfly instruction on half2, which performs a
+ * butterfly shuffle between threads in a warp.
+ * \param x The value in the source lane
+ * \param lane_mask The mask to xor with the thread index: y[i] <- x[i ^ lane_mask]
+ */
+__forceinline__ __device__ half2 shfl_xor_sync(half2 x, int lane_mask) {
+  return __shfl_xor_sync(0xffffffff, x, lane_mask);
+}
+
+/*!
+ * \brief Wrapper of PTX rsqrt approximation instruction, which computes
+ * 1/sqrt(x)
+ * \param x input
+ */
+__forceinline__ __device__ float rsqrt(float x) {
+  float y;
+  asm volatile("rsqrt.approx.ftz.f32 %0, %1;" : "=f"(y) : "f"(x));
+  return y;
+}
+
+/*!
+ * \brief Wrapper of PTX tanh.approx.f32 instruction, which computes tanh(x)
+ * \param x input
+ */
+__forceinline__ __device__ float tanh(float x) {
+  float y;
+  asm volatile("tanh.approx.f32 %0, %1;" : "=f"(y) : "f"(x));
+  return y;
+}
+
+/*!
+ * \brief Wrapper of PTX tanh.approx.f16x2 instruction, which computes tanh(x)
+ * \param x input
+ */
+__forceinline__ __device__ half2 tanh(half2 x) {
+  uint32_t y_u32;
+  uint32_t x_u32 = half2_as_uint32(x);
+  asm volatile("tanh.approx.f16x2 %0, %1;" : "=r"(y_u32) : "r"(x_u32));
+  return uint32_as_half2(y_u32);
+}
+
+/*!
+ * \brief Wrapper of PTX tanh.approx.f16 instruction, which computes tanh(x)
+ * \param x input
+ */
+__forceinline__ __device__ half tanh(half x) {
+  ushort y_u16;
+  asm volatile("tanh.approx.f16 %0, %1;"
+               : "=h"(y_u16)
+               : "h"(__half_as_ushort(x)));
+  return __ushort_as_half(y_u16);
+}
+
+}  // namespace math
+}  // namespace aphrodite
+#endif  // APHRODITE_MATH_CUH_

+ 391 - 0
kernels/sampling/sampling.cu

@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2024 by PygmalionAI team.
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <c10/cuda/CUDAStream.h>
+
+#include "sampling.cuh"
+#include "../ops.h"
+#include "utils.cuh"
+
+// Check utils
+#define CUDA_CHECK(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+#define CHECK_INPUT(x) \
+  CUDA_CHECK(x);       \
+  CHECK_CONTIGUOUS(x)
+
+#define CHECK_EQ(a, b) \
+  TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)
+
+#define CHECK_GE(a, b) \
+  TORCH_CHECK((a) >= (b), "CHECK_GE(" #a ", " #b ") failed. ", a, " vs ", b)
+
+#define CHECK_DIM(d, x) \
+  TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")
+
+using namespace aphrodite;
+
+torch::Tensor sampling_from_probs(torch::Tensor probs,
+                                  torch::Tensor uniform_samples,
+                                  bool deterministic) {
+  CHECK_INPUT(probs);
+  CHECK_INPUT(uniform_samples);
+  auto device = probs.device();
+  CHECK_EQ(uniform_samples.device(), device);
+  CHECK_DIM(2, probs);            // probs: (batch_size, vocab_size)
+  CHECK_DIM(1, uniform_samples);  // uniform_samples: (batch_size)
+  CHECK_EQ(probs.size(0), uniform_samples.size(0));
+  unsigned int batch_size = probs.size(0);
+  unsigned int vocab_size = probs.size(1);
+  probs = probs.to(torch::kFloat32);
+  uniform_samples = uniform_samples.to(torch::kFloat32);
+
+  cudaStream_t torch_current_stream =
+      c10::cuda::getCurrentCUDAStream(device.index());
+  auto samples =
+      torch::empty({batch_size}, torch::dtype(torch::kInt32).device(device));
+
+  cudaError_t status = sampling::SamplingFromProb(
+      static_cast<float*>(probs.data_ptr()),
+      static_cast<float*>(uniform_samples.data_ptr()),
+      static_cast<int*>(samples.data_ptr()), batch_size, vocab_size,
+      deterministic, torch_current_stream);
+  TORCH_CHECK(status == cudaSuccess,
+              "SamplingFromProbs failed with error code " +
+                  std::string(cudaGetErrorString(status)));
+  return samples;
+}
+
+std::vector<torch::Tensor> top_p_sampling_from_probs(
+    torch::Tensor probs, torch::Tensor uniform_samples,
+    std::optional<torch::Tensor> maybe_top_p_arr, double top_p_val,
+    bool deterministic) {
+  CHECK_INPUT(probs);
+  CHECK_INPUT(uniform_samples);
+  auto device = probs.device();
+  CHECK_EQ(uniform_samples.device(), device);
+  CHECK_DIM(2, probs);  // probs: (batch_size, vocab_size)
+  CHECK_DIM(
+      2, uniform_samples);  // uniform_samples: (max_top_p_rounds, batch_size)
+  CHECK_EQ(probs.size(0), uniform_samples.size(1));
+  unsigned int batch_size = probs.size(0);
+  unsigned int vocab_size = probs.size(1);
+  unsigned int max_top_p_rounds = uniform_samples.size(0);
+  bool has_top_p_arr = maybe_top_p_arr.has_value();
+  auto top_p_arr = maybe_top_p_arr.value_or(
+      torch::empty({0}, torch::dtype(torch::kFloat32)));
+  if (has_top_p_arr) {
+    CHECK_INPUT(top_p_arr);
+    CHECK_DIM(1, top_p_arr);  // top_p_arr: (batch_size,)
+    CHECK_EQ(top_p_arr.size(0), batch_size);
+    CHECK_EQ(top_p_arr.device(), device);
+  }
+  probs = probs.to(torch::kFloat32);
+  uniform_samples = uniform_samples.to(torch::kFloat32);
+  top_p_arr = top_p_arr.to(torch::kFloat32);
+
+  cudaStream_t torch_current_stream =
+      c10::cuda::getCurrentCUDAStream(device.index());
+  auto samples =
+      torch::empty({batch_size}, torch::dtype(torch::kInt32).device(device));
+  auto success =
+      torch::empty({batch_size}, torch::dtype(torch::kBool).device(device));
+
+  cudaError_t status = sampling::TopPSamplingFromProb<float, int>(
+      static_cast<float*>(probs.data_ptr()),
+      static_cast<float*>(uniform_samples.data_ptr()),
+      static_cast<int*>(samples.data_ptr()),
+      static_cast<bool*>(success.data_ptr()),
+      has_top_p_arr ? static_cast<float*>(top_p_arr.data_ptr()) : nullptr,
+      batch_size, top_p_val, vocab_size, max_top_p_rounds, deterministic,
+      torch_current_stream);
+  TORCH_CHECK(status == cudaSuccess,
+              "TopPSamplingFromProbs failed with error code " +
+                  std::string(cudaGetErrorString(status)));
+
+  return {samples, success};
+}
+
+std::vector<torch::Tensor> top_k_sampling_from_probs(
+    torch::Tensor probs, torch::Tensor uniform_samples,
+    std::optional<torch::Tensor> maybe_top_k_arr, int64_t top_k_val,
+    bool deterministic) {
+  CHECK_INPUT(probs);
+  CHECK_INPUT(uniform_samples);
+  auto device = probs.device();
+  CHECK_EQ(uniform_samples.device(), device);
+  CHECK_DIM(2, probs);  // probs: (batch_size, vocab_size)
+  CHECK_DIM(
+      2, uniform_samples);  // uniform_samples: (max_top_k_rounds, batch_size)
+  CHECK_EQ(probs.size(0), uniform_samples.size(1));
+  unsigned int batch_size = probs.size(0);
+  unsigned int vocab_size = probs.size(1);
+  unsigned int max_top_k_rounds = uniform_samples.size(0);
+  bool has_top_k_arr = maybe_top_k_arr.has_value();
+  auto top_k_arr =
+      maybe_top_k_arr.value_or(torch::empty({0}, torch::dtype(torch::kInt32)));
+  if (has_top_k_arr) {
+    CHECK_INPUT(top_k_arr);
+    CHECK_DIM(1, top_k_arr);  // top_k_arr: (batch_size,)
+    CHECK_EQ(top_k_arr.size(0), batch_size);
+    CHECK_EQ(top_k_arr.device(), device);
+  }
+  probs = probs.to(torch::kFloat32);
+  uniform_samples = uniform_samples.to(torch::kFloat32);
+  top_k_arr = top_k_arr.to(torch::kInt32);
+
+  cudaStream_t torch_current_stream =
+      c10::cuda::getCurrentCUDAStream(device.index());
+  auto samples =
+      torch::empty({batch_size}, torch::dtype(torch::kInt32).device(device));
+  auto success =
+      torch::empty({batch_size}, torch::dtype(torch::kBool).device(device));
+
+  cudaError_t status = sampling::TopKSamplingFromProb<float, int>(
+      static_cast<float*>(probs.data_ptr()),
+      static_cast<float*>(uniform_samples.data_ptr()),
+      static_cast<int*>(samples.data_ptr()),
+      static_cast<bool*>(success.data_ptr()),
+      has_top_k_arr ? static_cast<float*>(top_k_arr.data_ptr()) : nullptr,
+      batch_size, top_k_val, vocab_size, max_top_k_rounds, deterministic,
+      torch_current_stream);
+  TORCH_CHECK(status == cudaSuccess,
+              "TopKSamplingFromProbs failed with error code " +
+                  std::string(cudaGetErrorString(status)));
+
+  return {samples, success};
+}
+
+std::vector<torch::Tensor> min_p_sampling_from_probs(
+    torch::Tensor probs, torch::Tensor uniform_samples,
+    std::optional<torch::Tensor> maybe_min_p_arr, double min_p_val,
+    bool deterministic) {
+  CHECK_INPUT(probs);
+  CHECK_INPUT(uniform_samples);
+  auto device = probs.device();
+  CHECK_EQ(uniform_samples.device(), device);
+  CHECK_DIM(2, probs);            // probs: (batch_size, vocab_size)
+  CHECK_DIM(2, uniform_samples);  // uniform_samples: (max_rounds, batch_size)
+  unsigned int batch_size = probs.size(0);
+  unsigned int vocab_size = probs.size(1);
+  unsigned int max_rounds = uniform_samples.size(0);
+  CHECK_EQ(uniform_samples.size(1), batch_size);
+  bool has_min_p_arr = maybe_min_p_arr.has_value();
+  auto min_p_arr = maybe_min_p_arr.value_or(
+      torch::empty({0}, torch::dtype(torch::kFloat32)));
+  if (has_min_p_arr) {
+    CHECK_INPUT(min_p_arr);
+    CHECK_DIM(1, min_p_arr);  // min_p_arr: (batch_size,)
+    CHECK_EQ(min_p_arr.size(0), batch_size);
+    CHECK_EQ(min_p_arr.device(), device);
+  }
+  min_p_arr = min_p_arr.to(torch::kFloat32);
+  probs = probs.to(torch::kFloat32);
+  uniform_samples = uniform_samples.to(torch::kFloat32);
+
+  cudaStream_t torch_current_stream =
+      c10::cuda::getCurrentCUDAStream(device.index());
+  auto samples =
+      torch::empty({batch_size}, torch::dtype(torch::kInt32).device(device));
+  auto success =
+      torch::empty({batch_size}, torch::dtype(torch::kBool).device(device));
+
+  cudaError_t status = sampling::MinPSamplingFromProb<float, int>(
+      static_cast<float*>(probs.data_ptr()),
+      static_cast<float*>(uniform_samples.data_ptr()),
+      has_min_p_arr ? static_cast<float*>(min_p_arr.data_ptr()) : nullptr,
+      static_cast<int*>(samples.data_ptr()),
+      static_cast<bool*>(success.data_ptr()), batch_size, min_p_val, vocab_size,
+      max_rounds, deterministic, torch_current_stream);
+  TORCH_CHECK(status == cudaSuccess,
+              "MinPSamplingFromProb failed with error code " +
+                  std::string(cudaGetErrorString(status)));
+
+  return {samples, success};
+}
+
+std::vector<torch::Tensor> top_k_top_p_sampling_from_probs(
+    torch::Tensor probs, torch::Tensor uniform_samples,
+    std::optional<torch::Tensor> maybe_top_k_arr, double top_k_val,
+    std::optional<torch::Tensor> maybe_top_p_arr, double top_p_val,
+    bool deterministic) {
+  CHECK_INPUT(probs);
+  CHECK_INPUT(uniform_samples);
+  auto device = probs.device();
+  CHECK_EQ(uniform_samples.device(), device);
+  CHECK_DIM(2, probs);            // probs: (batch_size, vocab_size)
+  CHECK_DIM(2, uniform_samples);  // uniform_samples: (max_rounds, batch_size)
+  unsigned int batch_size = probs.size(0);
+  unsigned int vocab_size = probs.size(1);
+  unsigned int max_rounds = uniform_samples.size(0);
+  CHECK_EQ(uniform_samples.size(1), batch_size);
+  bool has_top_k_arr = maybe_top_k_arr.has_value();
+  auto top_k_arr =
+      maybe_top_k_arr.value_or(torch::empty({0}, torch::dtype(torch::kInt32)));
+  if (has_top_k_arr) {
+    CHECK_INPUT(top_k_arr);
+    CHECK_DIM(1, top_k_arr);  // top_k_arr: (batch_size,)
+    CHECK_EQ(top_k_arr.size(0), batch_size);
+    CHECK_EQ(top_k_arr.device(), device);
+  }
+  top_k_arr = top_k_arr.to(torch::kInt32);
+  bool has_top_p_arr = maybe_top_p_arr.has_value();
+  auto top_p_arr = maybe_top_p_arr.value_or(
+      torch::empty({0}, torch::dtype(torch::kFloat32)));
+  if (has_top_p_arr) {
+    CHECK_INPUT(top_p_arr);
+    CHECK_DIM(1, top_p_arr);  // top_p_arr: (batch_size,)
+    CHECK_EQ(top_p_arr.size(0), batch_size);
+    CHECK_EQ(top_p_arr.device(), device);
+  }
+  top_p_arr = top_p_arr.to(torch::kFloat32);
+  probs = probs.to(torch::kFloat32);
+  uniform_samples = uniform_samples.to(torch::kFloat32);
+
+  cudaStream_t torch_current_stream =
+      c10::cuda::getCurrentCUDAStream(device.index());
+  auto samples =
+      torch::empty({batch_size}, torch::dtype(torch::kInt32).device(device));
+  auto success =
+      torch::empty({batch_size}, torch::dtype(torch::kBool).device(device));
+
+  cudaError_t status = sampling::TopKTopPSamplingFromProb<float, int>(
+      static_cast<float*>(probs.data_ptr()),
+      static_cast<float*>(uniform_samples.data_ptr()),
+      has_top_k_arr ? static_cast<int*>(top_k_arr.data_ptr()) : nullptr,
+      has_top_p_arr ? static_cast<float*>(top_p_arr.data_ptr()) : nullptr,
+      static_cast<int*>(samples.data_ptr()),
+      static_cast<bool*>(success.data_ptr()), batch_size, top_k_val, top_p_val,
+      vocab_size, max_rounds, deterministic, torch_current_stream);
+  TORCH_CHECK(status == cudaSuccess,
+              "TopKTopPSamplingFromProbs failed with error code " +
+                  std::string(cudaGetErrorString(status)));
+
+  return {samples, success};
+}
+
+torch::Tensor top_p_renorm_prob(torch::Tensor probs,
+                                std::optional<torch::Tensor> maybe_top_p_arr,
+                                double top_p_val) {
+  CHECK_INPUT(probs);
+  auto device = probs.device();
+  CHECK_DIM(2, probs);  // probs: (batch_size, vocab_size)
+  unsigned int batch_size = probs.size(0);
+  unsigned int vocab_size = probs.size(1);
+  bool has_top_p_arr = maybe_top_p_arr.has_value();
+  auto top_p_arr = maybe_top_p_arr.value_or(
+      torch::empty({0}, torch::dtype(torch::kFloat32)));
+  if (has_top_p_arr) {
+    CHECK_INPUT(top_p_arr);
+    CHECK_DIM(1, top_p_arr);  // top_p_arr: (batch_size,)
+    CHECK_EQ(top_p_arr.size(0), batch_size);
+    CHECK_EQ(top_p_arr.device(), device);
+  }
+  top_p_arr = top_p_arr.to(torch::kFloat32);
+  probs = probs.to(torch::kFloat32);
+
+  cudaStream_t torch_current_stream =
+      c10::cuda::getCurrentCUDAStream(device.index());
+  auto renorm_probs = torch::empty(
+      {batch_size, vocab_size}, torch::dtype(torch::kFloat32).device(device));
+
+  cudaError_t status = sampling::TopPRenormProb<float>(
+      static_cast<float*>(probs.data_ptr()),
+      static_cast<float*>(renorm_probs.data_ptr()),
+      has_top_p_arr ? static_cast<float*>(top_p_arr.data_ptr()) : nullptr,
+      batch_size, top_p_val, vocab_size, torch_current_stream);
+  TORCH_CHECK(status == cudaSuccess,
+              "TopPRenormProb failed with error code " +
+                  std::string(cudaGetErrorString(status)));
+  return renorm_probs;
+}
+
+torch::Tensor top_k_renorm_prob(torch::Tensor probs,
+                                std::optional<torch::Tensor> maybe_top_k_arr,
+                                int64_t top_k_val) {
+  CHECK_INPUT(probs);
+  auto device = probs.device();
+  CHECK_DIM(2, probs);  // probs: (batch_size, vocab_size)
+  unsigned int batch_size = probs.size(0);
+  unsigned int vocab_size = probs.size(1);
+  bool has_top_k_arr = maybe_top_k_arr.has_value();
+  auto top_k_arr =
+      maybe_top_k_arr.value_or(torch::empty({0}, torch::dtype(torch::kInt32)));
+  if (has_top_k_arr) {
+    CHECK_INPUT(top_k_arr);
+    CHECK_DIM(1, top_k_arr);  // top_k_arr: (batch_size,)
+    CHECK_EQ(top_k_arr.size(0), batch_size);
+    CHECK_EQ(top_k_arr.device(), device);
+  }
+  top_k_arr = top_k_arr.to(torch::kInt32);
+  probs = probs.to(torch::kFloat32);
+
+  cudaStream_t torch_current_stream =
+      c10::cuda::getCurrentCUDAStream(device.index());
+  auto renorm_probs = torch::empty(
+      {batch_size, vocab_size}, torch::dtype(torch::kFloat32).device(device));
+
+  cudaError_t status = sampling::TopKRenormProb<float>(
+      static_cast<float*>(probs.data_ptr()),
+      static_cast<float*>(renorm_probs.data_ptr()),
+      has_top_k_arr ? static_cast<int*>(top_k_arr.data_ptr()) : nullptr,
+      batch_size, top_k_val, vocab_size, torch_current_stream);
+
+  TORCH_CHECK(status == cudaSuccess,
+              "TopKRenormProb failed with error code " +
+                  std::string(cudaGetErrorString(status)));
+  return renorm_probs;
+}
+
+torch::Tensor top_k_mask_logits(torch::Tensor logits,
+                                std::optional<torch::Tensor> maybe_top_k_arr,
+                                int64_t top_k_val) {
+  CHECK_INPUT(logits);
+  auto device = logits.device();
+  CHECK_DIM(2, logits);  // logits: (batch_size, vocab_size)
+  unsigned int batch_size = logits.size(0);
+  unsigned int vocab_size = logits.size(1);
+  bool has_top_k_arr = maybe_top_k_arr.has_value();
+  auto top_k_arr =
+      maybe_top_k_arr.value_or(torch::empty({0}, torch::dtype(torch::kInt32)));
+  if (has_top_k_arr) {
+    CHECK_INPUT(top_k_arr);
+    CHECK_DIM(1, top_k_arr);  // top_k_arr: (batch_size,)
+    CHECK_EQ(top_k_arr.size(0), batch_size);
+    CHECK_EQ(top_k_arr.device(), device);
+  }
+  top_k_arr = top_k_arr.to(torch::kInt32);
+  logits = logits.to(torch::kFloat32);
+
+  cudaStream_t torch_current_stream =
+      c10::cuda::getCurrentCUDAStream(device.index());
+  auto mask_logits = torch::empty({batch_size, vocab_size},
+                                  torch::dtype(torch::kFloat32).device(device));
+
+  cudaError_t status = sampling::TopKMaskLogits<float>(
+      static_cast<float*>(logits.data_ptr()),
+      static_cast<float*>(mask_logits.data_ptr()),
+      has_top_k_arr ? static_cast<int*>(top_k_arr.data_ptr()) : nullptr,
+      batch_size, top_k_val, vocab_size, torch_current_stream);
+
+  TORCH_CHECK(status == cudaSuccess,
+              "TopKMaskLogits failed with error code " +
+                  std::string(cudaGetErrorString(status)));
+  return mask_logits;
+}
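
These host wrappers all follow the same pattern: validate shapes, cast inputs to fp32/int32, launch the rejection-sampling kernel on the current CUDA stream, and return samples plus a per-row success flag that is false when a row did not converge within uniform_samples.size(0) rounds. A short, illustrative sketch of that contract using the min_p wrapper:

# Illustrative only: exercises the (samples, success) contract described above.
import torch

import aphrodite._custom_ops as ops

probs = torch.softmax(torch.randn(8, 32000, device="cuda"), dim=-1)
uniform_samples = torch.rand(32, 8, device="cuda")  # (max_rounds, batch_size)

samples, success = ops.min_p_sampling_from_probs(probs, uniform_samples,
                                                 min_p=0.05)
assert samples.dtype == torch.int32 and samples.shape == (8,)
if not success.all():
    # Rows that exhaust every round report success=False; callers are expected
    # to fall back (see _top_k_top_p_multinomial_with_kernels in sampler.py).
    pass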

+ 1398 - 0
kernels/sampling/sampling.cuh

@@ -0,0 +1,1398 @@
+/*
+ * Copyright (c) 2024 by PygmalionAI team.
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef APHRODITE_SAMPLING_CUH_
+#define APHRODITE_SAMPLING_CUH_
+
+#include <cub/block/block_adjacent_difference.cuh>
+#include <cub/block/block_reduce.cuh>
+#include <cub/block/block_scan.cuh>
+#include <numeric>
+
+#include "math.cuh"
+#include "utils.cuh"
+#include "vec_dtypes.cuh"
+
+namespace aphrodite {
+
+namespace sampling {
+
+using namespace cub;
+
+#define DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, ...) \
+  if (deterministic) {                                            \
+    constexpr bool DETERMINISTIC = true;                          \
+    __VA_ARGS__                                                   \
+  } else {                                                        \
+    constexpr bool DETERMINISTIC = false;                         \
+    __VA_ARGS__                                                   \
+  }
+
+constexpr BlockScanAlgorithm SCAN_ALGO = BLOCK_SCAN_WARP_SCANS;
+constexpr BlockReduceAlgorithm REDUCE_ALGO = BLOCK_REDUCE_WARP_REDUCTIONS;
+
+#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120100)
+  #define APHRODITE_CUB_SUBTRACTLEFT_DEFINED
+#endif
+
+template <typename T>
+struct Pair {
+  T value;
+  int count;
+
+  __device__ Pair operator+(const Pair& other) const {
+    return {value + other.value, count + other.count};
+  }
+  __device__ Pair& operator+=(const Pair& other) {
+    value += other.value;
+    count += other.count;
+    return *this;
+  }
+};
+
+struct BoolDiffOp {
+  __device__ __forceinline__ bool operator()(const bool& lhs,
+                                             const bool& rhs) const {
+    return lhs != rhs;
+  }
+};
+
+template <typename T, uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM>
+struct SamplingTempStorage {
+  union {
+    T deterministic_scan[BLOCK_THREADS / 32];
+    typename BlockScan<T, BLOCK_THREADS, SCAN_ALGORITHM>::TempStorage scan;
+    typename BlockReduce<T, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
+        reduce;
+    typename BlockReduce<Pair<T>, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
+        reduce_pair;
+    typename BlockAdjacentDifference<bool, BLOCK_THREADS>::TempStorage adj_diff;
+  } block_prim;
+  struct {
+    int32_t sampled_id;
+    union {
+      T value;
+      Pair<T> pair;
+      T max_p;
+    } block_aggregate;
+  } data;
+};
+
+/*!
+ * \brief Deterministic inclusive scan implementation using the Blelloch scan
+ * algorithm.
+ * \note This implementation is slower than cub::BlockScan, but it is
+ * deterministic.
+ */
+template <uint32_t VEC_SIZE, uint32_t BLOCK_THREADS,
+          BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM, typename T>
+__device__ __forceinline__ void DeterministicInclusiveSum(
+    const T* in_data, T* out_data,
+    SamplingTempStorage<T, BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>*
+        temp_storage) {
+  T* smem_prefix_sum = temp_storage->block_prim.deterministic_scan;
+  T thread_data[VEC_SIZE];
+  T thread_sum = 0;
+#pragma unroll
+  for (uint32_t i = 0; i < VEC_SIZE; ++i) {
+    thread_sum += in_data[i];
+    thread_data[i] = thread_sum;
+  }
+
+  T thread_exclusive_prefix_sum = thread_sum;
+
+#pragma unroll
+  for (uint32_t offset = 1; offset < 32; offset *= 2) {
+    T tmp = __shfl_up_sync(0xffffffff, thread_exclusive_prefix_sum, offset);
+    if ((threadIdx.x + 1) % (offset * 2) == 0) {
+      thread_exclusive_prefix_sum += tmp;
+    }
+  }
+
+  T warp_sum = __shfl_sync(0xffffffff, thread_exclusive_prefix_sum,
+                           threadIdx.x | 0xffffffff);
+  if (threadIdx.x % 32 == 31) {
+    thread_exclusive_prefix_sum = 0;
+  }
+
+#pragma unroll
+  for (uint32_t offset = 16; offset >= 1; offset /= 2) {
+    T tmp = __shfl_xor_sync(0xffffffff, thread_exclusive_prefix_sum, offset);
+    if ((threadIdx.x + 1) % (offset * 2) == 0) {
+      thread_exclusive_prefix_sum = tmp + thread_exclusive_prefix_sum;
+    }
+    if ((threadIdx.x + 1) % (offset * 2) == offset) {
+      thread_exclusive_prefix_sum = tmp;
+    }
+  }
+
+  smem_prefix_sum[threadIdx.x / 32] = warp_sum;
+  __syncthreads();
+
+  if (threadIdx.x < 32) {
+    T warp_exclusive_prefix_sum =
+        (threadIdx.x < BLOCK_THREADS / 32) ? smem_prefix_sum[threadIdx.x] : 0;
+
+#pragma unroll
+    for (uint32_t offset = 1; offset < 32; offset *= 2) {
+      T tmp = __shfl_up_sync(0xffffffff, warp_exclusive_prefix_sum, offset);
+      if ((threadIdx.x + 1) % (offset * 2) == 0) {
+        warp_exclusive_prefix_sum += tmp;
+      }
+    }
+
+    if (threadIdx.x % 32 == 31) {
+      warp_exclusive_prefix_sum = 0;
+    }
+
+#pragma unroll
+    for (uint32_t offset = 16; offset >= 1; offset /= 2) {
+      T tmp = __shfl_xor_sync(0xffffffff, warp_exclusive_prefix_sum, offset);
+      if ((threadIdx.x + 1) % (offset * 2) == 0) {
+        warp_exclusive_prefix_sum = tmp + warp_exclusive_prefix_sum;
+      }
+      if ((threadIdx.x + 1) % (offset * 2) == offset) {
+        warp_exclusive_prefix_sum = tmp;
+      }
+    }
+    if (threadIdx.x < BLOCK_THREADS / 32) {
+      smem_prefix_sum[threadIdx.x] = warp_exclusive_prefix_sum;
+    }
+  }
+  __syncthreads();
+
+#pragma unroll
+  for (uint32_t i = 0; i < VEC_SIZE; ++i) {
+    out_data[i] = smem_prefix_sum[threadIdx.x / 32] +
+                  thread_exclusive_prefix_sum + thread_data[i];
+  }
+}
+
+template <uint32_t VEC_SIZE, uint32_t BLOCK_THREADS,
+          BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM, bool DETERMINISTIC, typename T>
+__device__ __forceinline__ void DeviceSamplingFromProb(
+    uint32_t i, uint32_t d, T threshold, T u, vec_t<T, VEC_SIZE> prob_vec,
+    T& aggregate,
+    SamplingTempStorage<T, BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>*
+        temp_storage) {
+  const uint32_t tx = threadIdx.x;
+  T prob_greater_than_threshold[VEC_SIZE];
+  T inclusive_cdf[VEC_SIZE];
+  bool greater_than_u[VEC_SIZE], valid[VEC_SIZE];
+#pragma unroll
+  for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+    prob_greater_than_threshold[j] =
+        (prob_vec[j] > threshold) ? prob_vec[j] : T(0);
+    valid[j] =
+        prob_vec[j] > threshold && (i * BLOCK_THREADS + tx) * VEC_SIZE < d;
+  }
+  T aggregate_local = BlockReduce<T, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                          temp_storage->block_prim.reduce)
+                          .Sum<VEC_SIZE>(prob_greater_than_threshold);
+  if (tx == 0) {
+    temp_storage->data.block_aggregate.value = aggregate_local;
+  }
+  __syncthreads();
+  aggregate_local = temp_storage->data.block_aggregate.value;
+
+  if (aggregate + aggregate_local > u) {
+    if constexpr (DETERMINISTIC) {
+      DeterministicInclusiveSum<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM,
+                                REDUCE_ALGORITHM, T>(
+          prob_greater_than_threshold, inclusive_cdf, temp_storage);
+    } else {
+      BlockScan<T, BLOCK_THREADS, SCAN_ALGORITHM>(temp_storage->block_prim.scan)
+          .InclusiveSum<VEC_SIZE>(prob_greater_than_threshold, inclusive_cdf);
+
+      __syncthreads();
+    }
+
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      greater_than_u[j] = inclusive_cdf[j] + aggregate > u;
+    }
+
+    bool greater_than_u_diff[VEC_SIZE];
+#ifdef APHRODITE_CUB_SUBTRACTLEFT_DEFINED
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(
+        temp_storage->block_prim.adj_diff)
+        .SubtractLeft<VEC_SIZE>(greater_than_u, greater_than_u_diff,
+                                BoolDiffOp());
+#else
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(
+        temp_storage->block_prim.adj_diff)
+        .FlagHeads<VEC_SIZE>(greater_than_u_diff, greater_than_u, BoolDiffOp(),
+                             0);
+#endif
+    __syncthreads();
+
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      if (greater_than_u_diff[j] && valid[j]) {
+        if constexpr (DETERMINISTIC) {
+          temp_storage->data.sampled_id =
+              (i * BLOCK_THREADS + tx) * VEC_SIZE + j;
+        } else {
+          // cub's block scan result might not be monotonic, so we need to find
+          // the first element
+          atomicMin(&(temp_storage->data.sampled_id),
+                    (i * BLOCK_THREADS + tx) * VEC_SIZE + j);
+        }
+      }
+    }
+    __syncthreads();
+  }
+  aggregate += aggregate_local;
+}
+
+template <uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM, uint32_t VEC_SIZE,
+          bool DETERMINISTIC, typename DType, typename IdType>
+__global__ void SamplingFromProbKernel(DType* probs, DType* uniform_samples,
+                                       IdType* output, IdType* row_indices,
+                                       uint32_t d) {
+  const uint32_t bx = blockIdx.x, tx = threadIdx.x;
+  const uint32_t row_idx = row_indices == nullptr ? bx : row_indices[bx];
+
+  extern __shared__ __align__(
+      alignof(SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                  REDUCE_ALGORITHM>)) uint8_t smem_sampling[];
+  auto& temp_storage =
+      reinterpret_cast<SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                           REDUCE_ALGORITHM>&>(smem_sampling);
+  temp_storage.data.sampled_id = d - 1;
+  __syncthreads();
+
+  vec_t<DType, VEC_SIZE> probs_vec;
+  DType aggregate(0);
+  float u = uniform_samples[bx];
+
+  for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+    probs_vec.fill(DType(0));
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      probs_vec.load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                     tx * VEC_SIZE);
+    }
+
+    DeviceSamplingFromProb<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM,
+                           REDUCE_ALGORITHM, DETERMINISTIC, DType>(
+        i, d, DType(0), u, probs_vec, aggregate, &temp_storage);
+    if (float(aggregate) > u) {
+      break;
+    }
+  }
+  output[bx] = temp_storage.data.sampled_id;
+}
+
+template <uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM, uint32_t VEC_SIZE,
+          bool DETERMINISTIC, typename DType, typename IdType>
+__global__ void TopKSamplingFromProbKernel(DType* probs, DType* uniform_samples,
+                                           IdType* output, bool* success,
+                                           IdType* top_k_arr,
+                                           uint32_t top_k_val, uint32_t d,
+                                           uint32_t max_top_k_rounds) {
+  const uint32_t batch_size = gridDim.x;
+  const uint32_t bx = blockIdx.x, tx = threadIdx.x;
+  uint32_t k = top_k_arr == nullptr ? top_k_val : top_k_arr[bx];
+
+  extern __shared__ __align__(
+      alignof(SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                  REDUCE_ALGORITHM>)) uint8_t smem_sampling[];
+  auto& temp_storage =
+      reinterpret_cast<SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                           REDUCE_ALGORITHM>&>(smem_sampling);
+
+  vec_t<DType, VEC_SIZE> probs_vec;
+  DType aggregate;
+  DType q = DType(1);
+  DType pivot = DType(0);
+  IdType sampled_id;
+  for (uint32_t round = 0; round < max_top_k_rounds; ++round) {
+    temp_storage.data.sampled_id = d - 1;
+    __syncthreads();
+    DType u = uniform_samples[round * batch_size + bx] * q;
+    aggregate = DType(0);
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + bx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      }
+
+      DeviceSamplingFromProb<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM,
+                             REDUCE_ALGORITHM, DETERMINISTIC, DType>(
+          i, d, pivot, u, probs_vec, aggregate, &temp_storage);
+      if (aggregate > u) {
+        break;
+      }
+    }
+    __syncthreads();
+    sampled_id = temp_storage.data.sampled_id;
+    pivot = max(pivot, probs[bx * d + sampled_id]);
+
+    Pair<DType> aggregate_gt_pivot{DType(0), 0};
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + bx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      }
+
+      Pair<DType> probs_gt_pivot[VEC_SIZE];
+#pragma unroll
+      for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+        probs_gt_pivot[j] = {(probs_vec[j] > pivot) ? probs_vec[j] : DType(0),
+                             (probs_vec[j] > pivot &&
+                              (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+      }
+
+      aggregate_gt_pivot +=
+          BlockReduce<Pair<DType>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+              temp_storage.block_prim.reduce_pair)
+              .Sum<VEC_SIZE>(probs_gt_pivot);
+      if (tx == 0) {
+        temp_storage.data.block_aggregate.pair = aggregate_gt_pivot;
+      }
+      __syncthreads();
+    }
+    q = temp_storage.data.block_aggregate.pair.value;
+    if (temp_storage.data.block_aggregate.pair.count < k) {
+      break;
+    }
+  }
+  __syncthreads();
+  if (tx == 0) {
+    output[bx] = sampled_id;
+    if (temp_storage.data.block_aggregate.pair.count >= k) {
+      // failed to sample within max_top_k_rounds
+      if (success != nullptr) {
+        success[bx] = false;
+      }
+    } else {
+      if (success != nullptr) {
+        success[bx] = true;
+      }
+    }
+  }
+}
+
+template <uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM, uint32_t VEC_SIZE,
+          bool DETERMINISTIC, typename DType, typename IdType>
+__global__ void TopPSamplingFromProbKernel(DType* probs, DType* uniform_samples,
+                                           IdType* output, bool* success,
+                                           IdType* row_indices,
+                                           float* top_p_arr, float top_p_val,
+                                           uint32_t d,
+                                           uint32_t max_top_p_rounds) {
+  const uint32_t batch_size = gridDim.x;
+  const uint32_t bx = blockIdx.x, tx = threadIdx.x;
+  float top_p = (top_p_arr == nullptr) ? top_p_val : top_p_arr[bx];
+
+  const uint32_t row_idx = row_indices == nullptr ? bx : row_indices[bx];
+
+  extern __shared__ __align__(
+      alignof(SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                  REDUCE_ALGORITHM>)) uint8_t smem_sampling[];
+  auto& temp_storage =
+      reinterpret_cast<SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                           REDUCE_ALGORITHM>&>(smem_sampling);
+
+  vec_t<DType, VEC_SIZE> probs_vec;
+  DType aggregate;
+  DType q = DType(1);
+  DType pivot = DType(0);
+  IdType sampled_id;
+  for (uint32_t round = 0; round < max_top_p_rounds; ++round) {
+    temp_storage.data.sampled_id = d - 1;
+    __syncthreads();
+    DType u = uniform_samples[round * batch_size + bx] * q;
+    aggregate = DType(0);
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + row_idx * d +
+                       (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      }
+
+      DeviceSamplingFromProb<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM,
+                             REDUCE_ALGORITHM, DETERMINISTIC, DType>(
+          i, d, pivot, u, probs_vec, aggregate, &temp_storage);
+      if (aggregate > u) {
+        break;
+      }
+    }
+    __syncthreads();
+    sampled_id = temp_storage.data.sampled_id;
+    pivot = max(pivot, probs[row_idx * d + sampled_id]);
+
+    DType aggregate_gt_pivot = DType(0);
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + row_idx * d +
+                       (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      }
+
+      DType probs_gt_pivot[VEC_SIZE];
+#pragma unroll
+      for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+        probs_gt_pivot[j] = (probs_vec[j] > pivot) ? probs_vec[j] : DType(0);
+      }
+
+      aggregate_gt_pivot +=
+          BlockReduce<DType, BLOCK_THREADS>(temp_storage.block_prim.reduce)
+              .Sum<VEC_SIZE>(probs_gt_pivot);
+      if (tx == 0) {
+        temp_storage.data.block_aggregate.value = aggregate_gt_pivot;
+      }
+      __syncthreads();
+    }
+    q = temp_storage.data.block_aggregate.value;
+    if (float(q) < top_p) {
+      break;
+    }
+  }
+  __syncthreads();
+  if (tx == 0) {
+    output[bx] = sampled_id;
+    if (float(q) >= top_p) {
+      // failed to sample within max_top_p_rounds
+      if (success != nullptr) {
+        success[bx] = false;
+      }
+    } else {
+      if (success != nullptr) {
+        success[bx] = true;
+      }
+    }
+  }
+}
+
+template <uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM, uint32_t VEC_SIZE,
+          bool DETERMINISTIC, typename DType, typename IdType>
+__global__ void MinPSamplingFromProbKernel(DType* probs, DType* uniform_samples,
+                                           DType* min_p_arr, IdType* output,
+                                           bool* success, float min_p_val,
+                                           uint32_t d,
+                                           uint32_t max_min_p_rounds) {
+  const uint32_t batch_size = gridDim.x;
+  const uint32_t bx = blockIdx.x, tx = threadIdx.x;
+  DType p = (min_p_arr == nullptr) ? min_p_val : min_p_arr[bx];
+
+  extern __shared__ __align__(
+      alignof(SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                  REDUCE_ALGORITHM>)) uint8_t smem_sampling[];
+  auto& temp_storage =
+      reinterpret_cast<SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                           REDUCE_ALGORITHM>&>(smem_sampling);
+
+  vec_t<DType, VEC_SIZE> probs_vec;
+  DType aggregate;
+  DType q = DType(1);
+  DType pivot = DType(0);
+
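+  // First pass: find the row maximum probability so the min-p threshold can
+  // be expressed as scaled_p = max_p * p.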
+  DType max_p = 0;
+  for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+    probs_vec.fill(DType(0));
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      probs_vec.load(probs + bx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+    }
+    DType probs_[VEC_SIZE];
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      probs_[j] = probs_vec[j];
+    }
+    max_p = max(
+        max_p, BlockReduce<DType, BLOCK_THREADS>(temp_storage.block_prim.reduce)
+                   .Reduce<VEC_SIZE>(probs_, cub::Max()));
+    __syncthreads();
+  }
+  if (tx == 0) {
+    temp_storage.data.block_aggregate.max_p = max_p;
+  }
+  __syncthreads();
+  DType scaled_p = temp_storage.data.block_aggregate.max_p * p;
+
+  IdType sampled_id;
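+  // Rejection sampling: accept the first candidate whose probability reaches
+  // scaled_p; otherwise restrict the next draw to probs > pivot.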
+  for (uint32_t round = 0; round < max_min_p_rounds; ++round) {
+    temp_storage.data.sampled_id = d - 1;
+    __syncthreads();
+    DType u = uniform_samples[round * batch_size + bx] * q;
+    aggregate = DType(0);
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + bx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      }
+
+      DeviceSamplingFromProb<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM,
+                             REDUCE_ALGORITHM, DETERMINISTIC, DType>(
+          i, d, pivot, u, probs_vec, aggregate, &temp_storage);
+      if (aggregate > u) {
+        break;
+      }
+    }
+    __syncthreads();
+    sampled_id = temp_storage.data.sampled_id;
+    pivot = max(pivot, probs[bx * d + sampled_id]);
+    if (pivot >= scaled_p) {
+      break;
+    }
+
+    DType aggregate_gt_pivot = DType(0);
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + bx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      }
+
+      DType probs_gt_pivot[VEC_SIZE];
+#pragma unroll
+      for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+        probs_gt_pivot[j] = (probs_vec[j] > pivot) ? probs_vec[j] : DType(0);
+      }
+
+      aggregate_gt_pivot +=
+          BlockReduce<DType, BLOCK_THREADS>(temp_storage.block_prim.reduce)
+              .Sum<VEC_SIZE>(probs_gt_pivot);
+      if (tx == 0) {
+        temp_storage.data.block_aggregate.value = aggregate_gt_pivot;
+      }
+      __syncthreads();
+    }
+    q = temp_storage.data.block_aggregate.value;
+  }
+  __syncthreads();
+  if (tx == 0) {
+    output[bx] = sampled_id;
+    if (pivot < scaled_p) {
+      // failed to sample within max_min_p_rounds
+      if (success != nullptr) {
+        success[bx] = false;
+      }
+    } else {
+      if (success != nullptr) {
+        success[bx] = true;
+      }
+    }
+  }
+}
+
+template <uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM, uint32_t VEC_SIZE,
+          bool DETERMINISTIC, typename DType, typename IdType>
+__global__ void TopKTopPSamplingFromProbKernel(
+    DType* probs, DType* uniform_samples, IdType* top_k_arr, DType* top_p_arr,
+    IdType* output, bool* success, IdType top_k_val, DType top_p_val,
+    uint32_t d, uint32_t max_rounds) {
+  const uint32_t batch_size = gridDim.x;
+  const uint32_t bx = blockIdx.x, tx = threadIdx.x;
+  IdType k = top_k_arr == nullptr ? top_k_val : top_k_arr[bx];
+  DType p = top_p_arr == nullptr ? top_p_val : top_p_arr[bx];
+
+  extern __shared__ __align__(
+      alignof(SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                  REDUCE_ALGORITHM>)) uint8_t smem_sampling[];
+  auto& temp_storage =
+      reinterpret_cast<SamplingTempStorage<DType, BLOCK_THREADS, SCAN_ALGORITHM,
+                                           REDUCE_ALGORITHM>&>(smem_sampling);
+
+  vec_t<DType, VEC_SIZE> probs_vec;
+  DType aggregate;
+  DType q = DType(1);
+  DType pivot = DType(0);
+  IdType sampled_id;
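+  // Joint top-k/top-p rejection sampling: a candidate is accepted only when
+  // fewer than k probabilities and less than p total mass remain above the
+  // pivot.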
+  for (uint32_t round = 0; round < max_rounds; ++round) {
+    temp_storage.data.sampled_id = d - 1;
+    __syncthreads();
+    DType u = uniform_samples[round * batch_size + bx] * q;
+    aggregate = DType(0);
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + bx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      }
+
+      DeviceSamplingFromProb<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM,
+                             REDUCE_ALGORITHM, DETERMINISTIC, DType>(
+          i, d, pivot, u, probs_vec, aggregate, &temp_storage);
+      if (aggregate > u) {
+        break;
+      }
+    }
+    __syncthreads();
+    sampled_id = temp_storage.data.sampled_id;
+    pivot = max(pivot, probs[bx * d + sampled_id]);
+
+    Pair<DType> aggregate_gt_pivot{DType(0), 0};
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + bx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      }
+
+      Pair<DType> probs_gt_pivot[VEC_SIZE];
+#pragma unroll
+      for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+        probs_gt_pivot[j] = {(probs_vec[j] > pivot) ? probs_vec[j] : DType(0),
+                             (probs_vec[j] > pivot &&
+                              (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+      }
+
+      aggregate_gt_pivot +=
+          BlockReduce<Pair<DType>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+              temp_storage.block_prim.reduce_pair)
+              .Sum<VEC_SIZE>(probs_gt_pivot);
+      if (tx == 0) {
+        temp_storage.data.block_aggregate.pair = aggregate_gt_pivot;
+      }
+      __syncthreads();
+    }
+    q = temp_storage.data.block_aggregate.pair.value;
+    if (temp_storage.data.block_aggregate.pair.count < k && float(q) < p) {
+      break;
+    }
+  }
+  __syncthreads();
+  if (tx == 0) {
+    output[bx] = sampled_id;
+    if (temp_storage.data.block_aggregate.pair.count >= k || float(q) >= p) {
+      // failed to sample within max_rounds
+      if (success != nullptr) {
+        success[bx] = false;
+      }
+    } else {
+      if (success != nullptr) {
+        success[bx] = true;
+      }
+    }
+  }
+}
+
+template <typename T, typename IdType>
+cudaError_t SamplingFromProb(T* probs, T* uniform_samples, IdType* output,
+                             uint32_t batch_size, uint32_t d,
+                             bool deterministic, cudaStream_t stream = 0) {
+  constexpr uint32_t BLOCK_THREADS = 1024;
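+  // Use the widest per-thread vector that both divides d and fits in a
+  // 16-byte load.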
+  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  IdType* row_indices_placeholder = nullptr;
+  void* args[] = {&probs, &uniform_samples, &output, &row_indices_placeholder,
+                  &d};
+  const uint32_t smem_size =
+      sizeof(SamplingTempStorage<T, BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+
+  DISPATCH_ALIGNED_VEC_SIZE(
+      vec_size, VEC_SIZE,
+      {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
+        auto kernel =
+            SamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO,
+                                   VEC_SIZE, DETERMINISTIC, T, IdType>;
+        APHRODITE_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args,
+                                             smem_size, stream));
+      })});
+  return cudaSuccess;
+}
+
+template <typename T, typename IdType>
+cudaError_t ParallelSamplingFromProb(T* probs, T* uniform_samples,
+                                     IdType* output, IdType* row_indices,
+                                     uint32_t batch_size, uint32_t d,
+                                     bool deterministic,
+                                     cudaStream_t stream = 0) {
+  constexpr uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  void* args[] = {&probs, &uniform_samples, &output, &row_indices, &d};
+  const uint32_t smem_size =
+      sizeof(SamplingTempStorage<T, BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+
+  DISPATCH_ALIGNED_VEC_SIZE(
+      vec_size, VEC_SIZE,
+      {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
+        auto kernel =
+            SamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO,
+                                   VEC_SIZE, DETERMINISTIC, T, IdType>;
+        APHRODITE_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args,
+                                             smem_size, stream));
+      })});
+  return cudaSuccess;
+}
+
+template <typename T, typename IdType>
+cudaError_t TopKSamplingFromProb(T* probs, T* uniform_samples, IdType* output,
+                                 bool* success, T* top_k_arr,
+                                 uint32_t batch_size, uint32_t top_k_val,
+                                 uint32_t d, uint32_t max_top_k_rounds,
+                                 bool deterministic, cudaStream_t stream = 0) {
+  constexpr uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+
+  const uint32_t smem_size =
+      sizeof(SamplingTempStorage<T, BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  void* args[] = {&probs,     &uniform_samples, &output, &success,
+                  &top_k_arr, &top_k_val,       &d,      &max_top_k_rounds};
+
+  DISPATCH_ALIGNED_VEC_SIZE(
+      vec_size, VEC_SIZE,
+      {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
+        auto kernel =
+            TopKSamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO,
+                                       VEC_SIZE, DETERMINISTIC, T, IdType>;
+        APHRODITE_CUDA_CALL(cudaFuncSetAttribute(
+            kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+        APHRODITE_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args,
+                                             smem_size, stream));
+      })});
+  return cudaSuccess;
+}
+
+template <typename T, typename IdType>
+cudaError_t TopPSamplingFromProb(T* probs, T* uniform_samples, IdType* output,
+                                 bool* success, T* top_p_arr,
+                                 uint32_t batch_size, T top_p_val, uint32_t d,
+                                 uint32_t max_top_p_rounds, bool deterministic,
+                                 cudaStream_t stream = 0) {
+  constexpr uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+
+  const uint32_t smem_size =
+      sizeof(SamplingTempStorage<T, BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  IdType* row_indices_placeholder = nullptr;
+  void* args[] = {&probs,
+                  &uniform_samples,
+                  &output,
+                  &success,
+                  &row_indices_placeholder,
+                  &top_p_arr,
+                  &top_p_val,
+                  &d,
+                  &max_top_p_rounds};
+
+  DISPATCH_ALIGNED_VEC_SIZE(
+      vec_size, VEC_SIZE,
+      {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
+        auto kernel =
+            TopPSamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO,
+                                       VEC_SIZE, DETERMINISTIC, T, IdType>;
+        APHRODITE_CUDA_CALL(cudaFuncSetAttribute(
+            kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+        APHRODITE_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args,
+                                             smem_size, stream));
+      })});
+  return cudaSuccess;
+}
+
+template <typename T, typename IdType>
+cudaError_t MinPSamplingFromProb(T* probs, T* uniform_samples, T* min_p_arr,
+                                 IdType* output, bool* success,
+                                 uint32_t batch_size, float min_p_val,
+                                 uint32_t d, uint32_t max_rounds,
+                                 bool deterministic, cudaStream_t stream = 0) {
+  constexpr uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+
+  const uint32_t smem_size =
+      sizeof(SamplingTempStorage<T, BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  void* args[] = {&probs,   &uniform_samples, &min_p_arr, &output,
+                  &success, &min_p_val,       &d,         &max_rounds};
+
+  DISPATCH_ALIGNED_VEC_SIZE(
+      vec_size, VEC_SIZE,
+      {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
+        auto kernel =
+            MinPSamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO,
+                                       VEC_SIZE, DETERMINISTIC, T, IdType>;
+        APHRODITE_CUDA_CALL(cudaFuncSetAttribute(
+            kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+        APHRODITE_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args,
+                                             smem_size, stream));
+      })});
+  return cudaSuccess;
+}
+
+template <typename T, typename IdType>
+cudaError_t TopKTopPSamplingFromProb(T* probs, T* uniform_samples,
+                                     IdType* top_k_arr, T* top_p_arr,
+                                     IdType* output, bool* success,
+                                     uint32_t batch_size, IdType top_k_val,
+                                     T top_p_val, uint32_t d,
+                                     uint32_t max_rounds, bool deterministic,
+                                     cudaStream_t stream = 0) {
+  constexpr uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+
+  const uint32_t smem_size =
+      sizeof(SamplingTempStorage<T, BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  void* args[] = {&probs,  &uniform_samples, &top_k_arr, &top_p_arr,
+                  &output, &success,         &top_k_val, &top_p_val,
+                  &d,      &max_rounds};
+
+  DISPATCH_ALIGNED_VEC_SIZE(
+      vec_size, VEC_SIZE,
+      {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
+        auto kernel = TopKTopPSamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO,
+                                                     REDUCE_ALGO, VEC_SIZE,
+                                                     DETERMINISTIC, T, IdType>;
+        APHRODITE_CUDA_CALL(cudaFuncSetAttribute(
+            kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+        APHRODITE_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args,
+                                             smem_size, stream));
+      })});
+  return cudaSuccess;
+}
+
+template <typename T, uint32_t BLOCK_THREADS,
+          BlockReduceAlgorithm REDUCE_ALGORITHM>
+struct RenormTempStorage {
+  union {
+    typename BlockReduce<T, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
+        reduce;
+    typename BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
+        reduce_int;
+    typename BlockReduce<Pair<T>, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
+        reduce_pair;
+  } block_prim;
+  struct {
+    T max_val;
+    T min_val;
+    union {
+      T value;
+      int count;
+      Pair<T> pair;
+    } block_aggregate;
+  } data;
+};
+
+template <uint32_t BLOCK_THREADS, BlockReduceAlgorithm REDUCE_ALGORITHM,
+          uint32_t VEC_SIZE, typename DType>
+__global__ void TopPRenormProbKernel(DType* probs, DType* renormed_prob,
+                                     DType* top_p_arr, float top_p_val,
+                                     uint32_t d) {
+  const uint32_t bx = blockIdx.x, tx = threadIdx.x;
+  const uint32_t row_idx = bx;
+  float p = top_p_arr == nullptr ? top_p_val : top_p_arr[bx];
+
+  extern __shared__ __align__(
+      alignof(RenormTempStorage<DType, BLOCK_THREADS, REDUCE_ALGO>))
+      uint8_t smem_renorm[];
+  auto& temp_storage =
+      reinterpret_cast<RenormTempStorage<DType, BLOCK_THREADS, REDUCE_ALGO>&>(
+          smem_renorm);
+  temp_storage.data.max_val = DType(0);
+  vec_t<DType, VEC_SIZE> probs_vec;
+  DType probs_greater_than_pivot[VEC_SIZE];  // pivot initialized to 0
+
+  DType threadlocal_max_val = DType(0);
+  for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+    probs_vec.fill(DType(0));
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      probs_vec.load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                     tx * VEC_SIZE);
+    }
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      probs_greater_than_pivot[j] = probs_vec[j];
+    }
+    threadlocal_max_val =
+        max(threadlocal_max_val,
+            BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                temp_storage.block_prim.reduce)
+                .Reduce<VEC_SIZE>(probs_greater_than_pivot, cub::Max()));
+    __syncthreads();
+  }
+  if (tx == 0) {
+    temp_storage.data.max_val = threadlocal_max_val;
+  }
+  __syncthreads();
+  threadlocal_max_val = temp_storage.data.max_val;
+
+  float low = 0, high = threadlocal_max_val;
+  DType min_gt_low, max_le_high;
+  DType sum_low(1);
+  // f(x) = sum(probs[probs > x]), f(x) is non-increasing
+  // min_gt_low = min{p \in probs | p > low}, max_le_high = max{p \in probs | p
+  // <= high} loop invariant:
+  // - f(low) >= p, f(high) < p
+  // - f(low) > f(min_gt_low) >= f(max_le_high) == f(high)
+  // stopping condition
+  // - f(low) >= p, f(min_gt_low) == f(max_le_high) == f(high) < p
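+  // Binary search for the threshold `low`: the mass strictly above it is at
+  // least p, while the mass above the next larger probability value falls
+  // below p; entries above `low` are renormalized by 1 / sum_low afterwards.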
+  do {
+    DType threadlocal_sum(0);
+    float mid = (low + high) / 2;
+    min_gt_low = high;
+    max_le_high = low;
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                       tx * VEC_SIZE);
+      }
+#pragma unroll
+      for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+        probs_greater_than_pivot[j] =
+            (probs_vec[j] > mid) ? probs_vec[j] : DType(0);
+        if (probs_vec[j] > low && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
+          min_gt_low = min(min_gt_low, probs_vec[j]);
+        }
+        if (probs_vec[j] <= high &&
+            (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
+          max_le_high = max(max_le_high, probs_vec[j]);
+        }
+      }
+      threadlocal_sum += BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                             temp_storage.block_prim.reduce)
+                             .Sum<VEC_SIZE>(probs_greater_than_pivot);
+      __syncthreads();
+    }
+    min_gt_low = BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                     temp_storage.block_prim.reduce)
+                     .Reduce(min_gt_low, cub::Min());
+    __syncthreads();
+    max_le_high = BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                      temp_storage.block_prim.reduce)
+                      .Reduce(max_le_high, cub::Max());
+    if (tx == 0) {
+      temp_storage.data.block_aggregate.value = threadlocal_sum;
+      temp_storage.data.min_val = min_gt_low;
+      temp_storage.data.max_val = max_le_high;
+    }
+    __syncthreads();
+    threadlocal_sum = temp_storage.data.block_aggregate.value;
+    min_gt_low = temp_storage.data.min_val;
+    max_le_high = temp_storage.data.max_val;
+    if (threadlocal_sum >= p) {
+      low = mid;
+      sum_low = float(threadlocal_sum);
+    } else {
+      high = min(mid, max_le_high);
+    }
+  } while (min_gt_low != max_le_high);
+
+  DType normalizer = math::ptx_rcp(max(sum_low, 1e-8));
+
+  // normalize
+  for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+    probs_vec.fill(DType(0));
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      probs_vec.load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                     tx * VEC_SIZE);
+    }
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      probs_vec[j] =
+          (probs_vec[j] > low) ? probs_vec[j] * normalizer : DType(0);
+    }
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      probs_vec.store(renormed_prob + row_idx * d +
+                      i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+    }
+  }
+}
+
+template <uint32_t BLOCK_THREADS, BlockReduceAlgorithm REDUCE_ALGORITHM,
+          uint32_t VEC_SIZE, typename DType, typename IdType>
+__global__ void TopKMaskLogitsKernel(DType* logits, DType* masked_logits,
+                                     IdType* top_k_arr, uint32_t top_k_val,
+                                     uint32_t d) {
+  const uint32_t bx = blockIdx.x, tx = threadIdx.x;
+  const uint32_t row_idx = bx;
+  uint32_t k = top_k_arr == nullptr ? top_k_val : top_k_arr[bx];
+  float pivot = -std::numeric_limits<float>::infinity();
+  vec_t<DType, VEC_SIZE> logits_vec;
+  if (k < d) {
+    extern __shared__ __align__(
+        alignof(RenormTempStorage<DType, BLOCK_THREADS, REDUCE_ALGO>))
+        uint8_t smem_renorm[];
+    auto& temp_storage =
+        reinterpret_cast<RenormTempStorage<DType, BLOCK_THREADS, REDUCE_ALGO>&>(
+            smem_renorm);
+    DType logits_greater_than_pivot[VEC_SIZE];  // pivot initialized to -inf
+
+    DType threadlocal_max_val = DType(-std::numeric_limits<float>::infinity()),
+          threadlocal_min_val = DType(std::numeric_limits<float>::infinity());
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      logits_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        logits_vec.load(logits + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                        tx * VEC_SIZE);
+      }
+#pragma unroll
+      for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+        logits_greater_than_pivot[j] = logits_vec[j];
+      }
+      threadlocal_max_val =
+          max(threadlocal_max_val,
+              BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                  temp_storage.block_prim.reduce)
+                  .Reduce<VEC_SIZE>(logits_greater_than_pivot, cub::Max()));
+      __syncthreads();
+      threadlocal_min_val =
+          min(threadlocal_min_val,
+              BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                  temp_storage.block_prim.reduce)
+                  .Reduce<VEC_SIZE>(logits_greater_than_pivot, cub::Min()));
+      __syncthreads();
+    }
+    if (tx == 0) {
+      temp_storage.data.max_val = threadlocal_max_val;
+      temp_storage.data.min_val = threadlocal_min_val;
+    }
+    __syncthreads();
+    threadlocal_max_val = temp_storage.data.max_val;
+    threadlocal_min_val = temp_storage.data.min_val;
+
+    float low = threadlocal_min_val - 1, high = threadlocal_max_val;
+    DType min_gt_low, max_le_high;
+    // f(x) = len(nonzero(logits > x)), f(x) is non-increasing
+    // min_gt_low = min{l \in logits | l > low},
+    // max_le_high = max{l \in logits | l <= high}
+    // loop invariant:
+    // - f(low) >= k, f(high) < k
+    // - f(low) > f(min_gt_low) >= f(max_le_high) == f(high)
+    // stopping condition: min_gt_low == max_le_high
+    // - f(low) >= k, f(min_gt_low) == f(max_le_high) == f(high) < k
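+    // Binary search for the pivot logit: at least k logits stay strictly
+    // above it; everything at or below the pivot is masked to -inf below.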
+    do {
+      int threadlocal_count_sum = 0;
+      int probs_greater_than_pivot_count[VEC_SIZE];  // logits above the
+                                                     // current pivot candidate
+      float mid = (low + high) / 2;
+      min_gt_low = high;
+      max_le_high = low;
+      for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+        logits_vec.fill(DType(0));
+        if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+          logits_vec.load(logits + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                          tx * VEC_SIZE);
+        }
+#pragma unroll
+        for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+          probs_greater_than_pivot_count[j] =
+              logits_vec[j] > mid &&
+              (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d;
+          if (logits_vec[j] > low &&
+              (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
+            min_gt_low = min(min_gt_low, logits_vec[j]);
+          }
+          if (logits_vec[j] <= high &&
+              (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
+            max_le_high = max(max_le_high, logits_vec[j]);
+          }
+        }
+        threadlocal_count_sum +=
+            BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                temp_storage.block_prim.reduce_int)
+                .Sum<VEC_SIZE>(probs_greater_than_pivot_count);
+        __syncthreads();
+      }
+      min_gt_low = BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                       temp_storage.block_prim.reduce)
+                       .Reduce(min_gt_low, cub::Min());
+      __syncthreads();
+      max_le_high = BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                        temp_storage.block_prim.reduce)
+                        .Reduce(max_le_high, cub::Max());
+      if (tx == 0) {
+        temp_storage.data.block_aggregate.count = threadlocal_count_sum;
+        temp_storage.data.min_val = min_gt_low;
+        temp_storage.data.max_val = max_le_high;
+      }
+      __syncthreads();
+      threadlocal_count_sum = temp_storage.data.block_aggregate.count;
+      min_gt_low = temp_storage.data.min_val;
+      max_le_high = temp_storage.data.max_val;
+      if (threadlocal_count_sum >= k) {
+        low = mid;
+      } else {
+        high = min(mid, max_le_high);
+      }
+    } while (min_gt_low != max_le_high);
+    pivot = low;
+  }
+
+  // masking
+  for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+    logits_vec.fill(DType(0));
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      logits_vec.load(logits + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                      tx * VEC_SIZE);
+    }
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      logits_vec[j] = (logits_vec[j] > pivot)
+                          ? logits_vec[j]
+                          : DType(-std::numeric_limits<float>::infinity());
+    }
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      logits_vec.store(masked_logits + row_idx * d +
+                       i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+    }
+  }
+}
+
+template <uint32_t BLOCK_THREADS, BlockReduceAlgorithm REDUCE_ALGORITHM,
+          uint32_t VEC_SIZE, typename DType, typename IdType>
+__global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob,
+                                     IdType* top_k_arr, uint32_t top_k_val,
+                                     uint32_t d) {
+  const uint32_t bx = blockIdx.x, tx = threadIdx.x;
+  const uint32_t row_idx = bx;
+  uint32_t k = top_k_arr == nullptr ? top_k_val : top_k_arr[bx];
+  float pivot = -std::numeric_limits<float>::infinity(), normalizer = 1;
+  vec_t<DType, VEC_SIZE> probs_vec;
+  if (k < d) {
+    extern __shared__ __align__(
+        alignof(RenormTempStorage<DType, BLOCK_THREADS, REDUCE_ALGO>))
+        uint8_t smem_renorm[];
+    auto& temp_storage =
+        reinterpret_cast<RenormTempStorage<DType, BLOCK_THREADS, REDUCE_ALGO>&>(
+            smem_renorm);
+    temp_storage.data.max_val = DType(0);
+    DType probs_greater_than_pivot[VEC_SIZE];  // pivot initialized to 0
+
+    DType threadlocal_max_val = DType(0);
+    for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+      probs_vec.fill(DType(0));
+      if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+        probs_vec.load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                       tx * VEC_SIZE);
+      }
+#pragma unroll
+      for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+        probs_greater_than_pivot[j] = probs_vec[j];
+      }
+      threadlocal_max_val =
+          max(threadlocal_max_val,
+              BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                  temp_storage.block_prim.reduce)
+                  .Reduce<VEC_SIZE>(probs_greater_than_pivot, cub::Max()));
+      __syncthreads();
+    }
+    if (tx == 0) {
+      temp_storage.data.max_val = threadlocal_max_val;
+    }
+    __syncthreads();
+    threadlocal_max_val = temp_storage.data.max_val;
+
+    float low = 0, high = threadlocal_max_val;
+    DType min_gt_low, max_le_high;
+    DType sum_low(1);
+    // f(x) = len(nonzero(probs > x)), f(x) is non-increasing
+    // min_gt_low = min{p \in probs | p > low}, max_le_high = max{p \in probs |
+    // p <= high} loop invariant:
+    // - f(low) >= k, f(high) < k
+    // - f(low) > f(min_gt_low) >= f(max_le_high) == f(high)
+    // stopping condition: min_gt_low == max_le_high
+    // - f(low) >= k, f(min_gt_low) == f(max_le_high) == f(high) < k
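+    // Binary search for the pivot: at least k probabilities stay strictly
+    // above it; they are renormalized by 1 / sum_low and the rest are zeroed.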
+    do {
+      Pair<DType> threadlocal_sum{DType(0), 0};
+      Pair<DType>
+          probs_greater_than_pivot_pair[VEC_SIZE];  // pivot initialized to 0
+      float mid = (low + high) / 2;
+      min_gt_low = high;
+      max_le_high = low;
+      for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+        probs_vec.fill(DType(0));
+        if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+          probs_vec.load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                         tx * VEC_SIZE);
+        }
+#pragma unroll
+        for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+          probs_greater_than_pivot_pair[j] = {
+              (probs_vec[j] > mid) ? probs_vec[j] : DType(0),
+              (probs_vec[j] > mid &&
+               (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+          if (probs_vec[j] > low &&
+              (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
+            min_gt_low = min(min_gt_low, probs_vec[j]);
+          }
+          if (probs_vec[j] <= high &&
+              (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
+            max_le_high = max(max_le_high, probs_vec[j]);
+          }
+        }
+        threadlocal_sum +=
+            BlockReduce<Pair<DType>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                temp_storage.block_prim.reduce_pair)
+                .Sum<VEC_SIZE>(probs_greater_than_pivot_pair);
+        __syncthreads();
+      }
+      min_gt_low = BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                       temp_storage.block_prim.reduce)
+                       .Reduce(min_gt_low, cub::Min());
+      __syncthreads();
+      max_le_high = BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                        temp_storage.block_prim.reduce)
+                        .Reduce(max_le_high, cub::Max());
+      if (tx == 0) {
+        temp_storage.data.block_aggregate.pair = threadlocal_sum;
+        temp_storage.data.min_val = min_gt_low;
+        temp_storage.data.max_val = max_le_high;
+      }
+      __syncthreads();
+      threadlocal_sum = temp_storage.data.block_aggregate.pair;
+      min_gt_low = temp_storage.data.min_val;
+      max_le_high = temp_storage.data.max_val;
+      if (threadlocal_sum.count >= k) {
+        low = mid;
+        sum_low = float(threadlocal_sum.value);
+      } else {
+        high = min(mid, max_le_high);
+      }
+    } while (min_gt_low != max_le_high);
+
+    normalizer = math::ptx_rcp(max(sum_low, 1e-8));
+    pivot = low;
+  }
+
+  // normalize
+  for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+    probs_vec.fill(DType(0));
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      probs_vec.load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                     tx * VEC_SIZE);
+    }
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      probs_vec[j] =
+          (probs_vec[j] > pivot) ? probs_vec[j] * normalizer : DType(0);
+    }
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      probs_vec.store(renormed_prob + row_idx * d +
+                      i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+    }
+  }
+}
+
+template <typename DType>
+cudaError_t TopPRenormProb(DType* probs, DType* renormed_prob, DType* top_p_arr,
+                           uint32_t batch_size, float top_p_val, uint32_t d,
+                           cudaStream_t stream = 0) {
+  const uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(DType), d);
+
+  const uint32_t smem_size =
+      sizeof(RenormTempStorage<DType, BLOCK_THREADS, REDUCE_ALGO>);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  void* args[] = {&probs, &renormed_prob, &top_p_arr, &top_p_val, &d};
+  DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
+    auto kernel =
+        TopPRenormProbKernel<BLOCK_THREADS, REDUCE_ALGO, VEC_SIZE, DType>;
+    APHRODITE_CUDA_CALL(cudaFuncSetAttribute(
+        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    APHRODITE_CUDA_CALL(
+        cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
+  });
+  return cudaSuccess;
+}
+
+template <typename DType, typename IdType>
+cudaError_t TopKRenormProb(DType* probs, DType* renormed_prob,
+                           IdType* top_k_arr, uint32_t batch_size,
+                           uint32_t top_k_val, uint32_t d,
+                           cudaStream_t stream = 0) {
+  const uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(DType), d);
+
+  const uint32_t smem_size =
+      sizeof(RenormTempStorage<DType, BLOCK_THREADS, REDUCE_ALGO>);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  void* args[] = {&probs, &renormed_prob, &top_k_arr, &top_k_val, &d};
+  DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
+    auto kernel = TopKRenormProbKernel<BLOCK_THREADS, REDUCE_ALGO, VEC_SIZE,
+                                       DType, IdType>;
+    APHRODITE_CUDA_CALL(cudaFuncSetAttribute(
+        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    APHRODITE_CUDA_CALL(
+        cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
+  });
+  return cudaSuccess;
+}
+
+template <typename DType, typename IdType>
+cudaError_t TopKMaskLogits(DType* logits, DType* masked_logits,
+                           IdType* top_k_arr, uint32_t batch_size,
+                           uint32_t top_k_val, uint32_t d,
+                           cudaStream_t stream = 0) {
+  const uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(DType), d);
+
+  const uint32_t smem_size =
+      sizeof(RenormTempStorage<DType, BLOCK_THREADS, REDUCE_ALGO>);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  void* args[] = {&logits, &masked_logits, &top_k_arr, &top_k_val, &d};
+  DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
+    auto kernel = TopKMaskLogitsKernel<BLOCK_THREADS, REDUCE_ALGO, VEC_SIZE,
+                                       DType, IdType>;
+    APHRODITE_CUDA_CALL(cudaFuncSetAttribute(
+        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    APHRODITE_CUDA_CALL(
+        cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
+  });
+  return cudaSuccess;
+}
+
+template <typename T, typename IdType>
+cudaError_t ParallelTopPSamplingFromProb(
+    T* probs, T* uniform_samples, IdType* output, bool* success,
+    IdType* row_indices, T* top_p_arr, uint32_t batch_size, uint32_t d,
+    uint32_t max_top_p_rounds, bool deterministic, cudaStream_t stream = 0) {
+  constexpr uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+
+  const uint32_t smem_size =
+      sizeof(SamplingTempStorage<T, BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  T top_p_placeholder = 0;
+  void* args[] = {
+      &probs,     &uniform_samples,   &output, &success,         &row_indices,
+      &top_p_arr, &top_p_placeholder, &d,      &max_top_p_rounds};
+
+  DISPATCH_ALIGNED_VEC_SIZE(
+      vec_size, VEC_SIZE,
+      {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
+        auto kernel =
+            TopPSamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO,
+                                       VEC_SIZE, DETERMINISTIC, T, IdType>;
+        APHRODITE_CUDA_CALL(cudaFuncSetAttribute(
+            kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+        APHRODITE_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args,
+                                             smem_size, stream));
+      })});
+  return cudaSuccess;
+}
+
+}  // namespace sampling
+
+}  // namespace aphrodite
+
+#endif  // APHRODITE_SAMPLING_CUH_

+ 273 - 0
kernels/sampling/utils.cuh

@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2024 by PygmalionAI team.
+ * Copyright (c) 2023 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef APHRODITE_UTILS_CUH_
+#define APHRODITE_UTILS_CUH_
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+#include <torch/all.h>
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+// macro to turn off fp16 qk reduction to reduce binary size
+#ifndef APHRODITE_ALWAYS_DISALLOW_FP16_QK_REDUCTION
+  #define APHRODITE_ALWAYS_DISALLOW_FP16_QK_REDUCTION 0
+#endif
+
+#ifndef NDEBUG
+  #define APHRODITE_CUDA_CALL(func, ...)                                  \
+    {                                                                     \
+      cudaError_t e = (func);                                             \
+      if (e != cudaSuccess) {                                             \
+        std::cerr << "CUDA Error: " << cudaGetErrorString(e) << " (" << e \
+                  << ") " << __FILE__ << ": line " << __LINE__            \
+                  << " at function " << STR(func) << std::endl;           \
+        return e;                                                         \
+      }                                                                   \
+    }
+#else
+  #define APHRODITE_CUDA_CALL(func, ...) \
+    {                                    \
+      cudaError_t e = (func);            \
+      if (e != cudaSuccess) {            \
+        return e;                        \
+      }                                  \
+    }
+#endif
+
+#define DISPATCH_ALLOW_FP16_QK_REDUCTION(allow_fp16_qk_reduction,           \
+                                         ALLOW_FP16_QK_REDUCTION, ...)      \
+  if (allow_fp16_qk_reduction) {                                            \
+    throw std::runtime_error("FP16_QK_REDUCTION disabled at compile time"); \
+  } else {                                                                  \
+    constexpr bool ALLOW_FP16_QK_REDUCTION = false;                         \
+    __VA_ARGS__                                                             \
+  }
+
+#define DISPATCH_NUM_FRAGS_X(num_frags_x, NUM_FRAGS_X, ...) \
+  if (num_frags_x == 1) {                                   \
+    constexpr size_t NUM_FRAGS_X = 1;                       \
+    __VA_ARGS__                                             \
+  } else if (num_frags_x == 2) {                            \
+    constexpr size_t NUM_FRAGS_X = 2;                       \
+    __VA_ARGS__                                             \
+  } else {                                                  \
+    std::ostringstream err_msg;                             \
+    err_msg << "Unsupported num_frags_x: " << num_frags_x;  \
+    throw std::invalid_argument(err_msg.str());             \
+  }
+
+#define DISPATCH_NUM_FRAGS_Z(max_frags_z, NUM_FRAGS_Z, ...) \
+  if (max_frags_z >= 8) {                                   \
+    constexpr size_t NUM_FRAGS_Z = 8;                       \
+    __VA_ARGS__                                             \
+  } else if (max_frags_z >= 4) {                            \
+    constexpr size_t NUM_FRAGS_Z = 4;                       \
+    __VA_ARGS__                                             \
+  } else if (max_frags_z >= 2) {                            \
+    constexpr size_t NUM_FRAGS_Z = 2;                       \
+    __VA_ARGS__                                             \
+  } else if (max_frags_z >= 1) {                            \
+    constexpr size_t NUM_FRAGS_Z = 1;                       \
+    __VA_ARGS__                                             \
+  } else {                                                  \
+    std::ostringstream err_msg;                             \
+    err_msg << "Unsupported max_frags_z: " << max_frags_z;  \
+    throw std::invalid_argument(err_msg.str());             \
+  }
+
+#define DISPATCH_GQA_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
+  if (group_size == 1) {                                     \
+    constexpr size_t GROUP_SIZE = 1;                         \
+    __VA_ARGS__                                              \
+  } else if (group_size == 2) {                              \
+    constexpr size_t GROUP_SIZE = 2;                         \
+    __VA_ARGS__                                              \
+  } else if (group_size == 4) {                              \
+    constexpr size_t GROUP_SIZE = 4;                         \
+    __VA_ARGS__                                              \
+  } else if (group_size == 8) {                              \
+    constexpr size_t GROUP_SIZE = 8;                         \
+    __VA_ARGS__                                              \
+  } else {                                                   \
+    std::ostringstream err_msg;                              \
+    err_msg << "Unsupported group_size: " << group_size;     \
+    throw std::invalid_argument(err_msg.str());              \
+  }
+
+#define DISPATCH_MASK_MODE(mask_mode, MASK_MODE, ...)         \
+  switch (mask_mode) {                                        \
+    case MaskMode::kNone: {                                   \
+      constexpr MaskMode MASK_MODE = MaskMode::kNone;         \
+      __VA_ARGS__                                             \
+      break;                                                  \
+    }                                                         \
+    case MaskMode::kCausal: {                                 \
+      constexpr MaskMode MASK_MODE = MaskMode::kCausal;       \
+      __VA_ARGS__                                             \
+      break;                                                  \
+    }                                                         \
+    case MaskMode::kCustom: {                                 \
+      constexpr MaskMode MASK_MODE = MaskMode::kCustom;       \
+      __VA_ARGS__                                             \
+      break;                                                  \
+    }                                                         \
+    default: {                                                \
+      std::ostringstream err_msg;                             \
+      err_msg << "Unsupported mask_mode: " << int(mask_mode); \
+      throw std::invalid_argument(err_msg.str());             \
+    }                                                         \
+  }
+
+#define DISPATCH_LOGITS_POST_HOOK(logits_soft_cap, LOGITS_POST_HOOK, ...) \
+  if (logits_soft_cap > 0.f) {                                            \
+    constexpr LogitsPostHook LOGITS_POST_HOOK = LogitsPostHook::kSoftCap; \
+    __VA_ARGS__                                                           \
+  } else if (logits_soft_cap == 0.f) {                                    \
+    constexpr LogitsPostHook LOGITS_POST_HOOK = LogitsPostHook::kNone;    \
+    __VA_ARGS__                                                           \
+  } else {                                                                \
+    std::ostringstream err_msg;                                           \
+    err_msg << "Invalid logits_soft_cap (should be >= 0): "               \
+            << logits_soft_cap;                                           \
+    throw std::invalid_argument(err_msg.str());                           \
+  }
+
+#define DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, ...)     \
+  switch (head_dim) {                                  \
+    case 64: {                                         \
+      constexpr size_t HEAD_DIM = 64;                  \
+      __VA_ARGS__                                      \
+      break;                                           \
+    }                                                  \
+    case 128: {                                        \
+      constexpr size_t HEAD_DIM = 128;                 \
+      __VA_ARGS__                                      \
+      break;                                           \
+    }                                                  \
+    case 256: {                                        \
+      constexpr size_t HEAD_DIM = 256;                 \
+      __VA_ARGS__                                      \
+      break;                                           \
+    }                                                  \
+    default: {                                         \
+      std::ostringstream err_msg;                      \
+      err_msg << "Unsupported head_dim: " << head_dim; \
+      throw std::invalid_argument(err_msg.str());      \
+    }                                                  \
+  }
+
+#define DISPATCH_POS_ENCODING_MODE(pos_encoding_mode, POS_ENCODING_MODE, ...) \
+  switch (pos_encoding_mode) {                                                \
+    case PosEncodingMode::kNone: {                                            \
+      constexpr PosEncodingMode POS_ENCODING_MODE = PosEncodingMode::kNone;   \
+      __VA_ARGS__                                                             \
+      break;                                                                  \
+    }                                                                         \
+    case PosEncodingMode::kRoPELlama: {                                       \
+      constexpr PosEncodingMode POS_ENCODING_MODE =                           \
+          PosEncodingMode::kRoPELlama;                                        \
+      __VA_ARGS__                                                             \
+      break;                                                                  \
+    }                                                                         \
+    case PosEncodingMode::kALiBi: {                                           \
+      constexpr PosEncodingMode POS_ENCODING_MODE = PosEncodingMode::kALiBi;  \
+      __VA_ARGS__                                                             \
+      break;                                                                  \
+    }                                                                         \
+    default: {                                                                \
+      std::ostringstream err_msg;                                             \
+      err_msg << "Unsupported pos_encoding_mode: " << int(pos_encoding_mode); \
+      throw std::invalid_argument(err_msg.str());                             \
+    }                                                                         \
+  }
+
+#define DISPATCH_ALIGNED_VEC_SIZE(aligned_vec_size, ALIGNED_VEC_SIZE, ...) \
+  switch (aligned_vec_size) {                                              \
+    case 16: {                                                             \
+      constexpr size_t ALIGNED_VEC_SIZE = 16;                              \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    case 8: {                                                              \
+      constexpr size_t ALIGNED_VEC_SIZE = 8;                               \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    case 4: {                                                              \
+      constexpr size_t ALIGNED_VEC_SIZE = 4;                               \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    case 2: {                                                              \
+      constexpr size_t ALIGNED_VEC_SIZE = 2;                               \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    case 1: {                                                              \
+      constexpr size_t ALIGNED_VEC_SIZE = 1;                               \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    default: {                                                             \
+      std::ostringstream err_msg;                                          \
+      err_msg << "Unsupported aligned_vec_size: " << aligned_vec_size;     \
+      throw std::invalid_argument(err_msg.str());                          \
+    }                                                                      \
+  }
+
+namespace aphrodite {
+
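+// Integer ceiling division: smallest integer >= x / y for non-negative inputs.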
+template <typename T1, typename T2>
+__forceinline__ __device__ __host__ T1 ceil_div(const T1 x, const T2 y) {
+  return (x + y - 1) / y;
+}
+
+template <typename T>
+inline void DebugPrintCUDAArray(T* device_ptr, size_t size,
+                                std::string prefix = "") {
+  std::vector<T> host_array(size);
+  std::cout << prefix;
+  cudaMemcpy(host_array.data(), device_ptr, size * sizeof(T),
+             cudaMemcpyDeviceToHost);
+  for (size_t i = 0; i < size; ++i) {
+    std::cout << host_array[i] << " ";
+  }
+  std::cout << std::endl;
+}
+
+/*!
+ * \brief Return x - y if x > y, otherwise return 0.
+ */
+__device__ __forceinline__ uint32_t sub_if_greater_or_zero(uint32_t x,
+                                                           uint32_t y) {
+  return (x > y) ? x - y : 0U;
+}
+
+__device__ __forceinline__ void swap(uint32_t& a, uint32_t& b) {
+  uint32_t tmp = a;
+  a = b;
+  b = tmp;
+}
+
+}  // namespace aphrodite
+
+#endif  // APHRODITE_UTILS_CUH_

+ 1501 - 0
kernels/sampling/vec_dtypes.cuh

@@ -0,0 +1,1501 @@
+/*
+ * Copyright (c) 2024 by PygmalionAI team.
+ * Copyright (c) 2023 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef VEC_DTYPES_CUH_
+#define VEC_DTYPES_CUH_
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_fp8.h>
+#include <cuda_runtime.h>
+
+#include <type_traits>
+
+namespace aphrodite {
+
+#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 900))
+  #define APHRODITE_HARDWARE_FP8_CONVERSION_ENABLED
+#endif
+
+#define APHRODITE_INLINE inline __attribute__((always_inline)) __device__
+
+/******************* vec_t type cast *******************/
+
+template <typename dst_t, typename src_t>
+struct vec_cast {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(dst_t* dst, const src_t* src) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size; ++i) {
+      dst[i] = (dst_t)src[i];
+    }
+  }
+};
+
+template <>
+struct vec_cast<float, half> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(float* dst, const half* src) {
+    if constexpr (vec_size == 1) {
+      dst[0] = (float)src[0];
+    } else {
+#pragma unroll
+      for (size_t i = 0; i < vec_size / 2; ++i) {
+        ((float2*)dst)[i] = __half22float2(((half2*)src)[i]);
+      }
+    }
+  }
+};
+
+template <>
+struct vec_cast<half, float> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(half* dst, const float* src) {
+    if constexpr (vec_size == 1) {
+      dst[0] = __float2half(src[0]);
+    } else {
+#pragma unroll
+      for (size_t i = 0; i < vec_size / 2; ++i) {
+        ((half2*)dst)[i] = __float22half2_rn(((float2*)src)[i]);
+      }
+    }
+  }
+};
+
+template <typename T>
+constexpr APHRODITE_INLINE int get_exponent_bits() {
+  if constexpr (std::is_same<T, __nv_fp8_e4m3>::value) {
+    return 4;
+  } else if constexpr (std::is_same<T, __nv_fp8_e5m2>::value) {
+    return 5;
+  } else if constexpr (std::is_same<T, half>::value) {
+    return 5;
+  } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
+    return 8;
+  }
+}
+
+template <typename T>
+constexpr APHRODITE_INLINE int get_mantissa_bits() {
+  if constexpr (std::is_same<T, __nv_fp8_e4m3>::value) {
+    return 3;
+  } else if constexpr (std::is_same<T, __nv_fp8_e5m2>::value) {
+    return 2;
+  } else if constexpr (std::is_same<T, half>::value) {
+    return 11;
+  } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
+    return 7;
+  }
+}
+
+/*!
+ * \brief Fallback to software fast dequant implementation if hardware
+ * dequantization is not available. \note Inspired by Marlin's fast
+ * dequantization, but here we don't have to permute weights order. \ref
+ * https://github.com/vllm-project/vllm/blob/6dffa4b0a6120159ef2fe44d695a46817aff65bc/csrc/quantization/fp8/fp8_marlin.cu#L120
+ */
+template <typename fp8_dtype, typename fp16_dtype>
+__device__ void fast_dequant_f8f16x4(uint32_t* input, uint2* output) {
+  uint32_t q = *input;
+  if constexpr (std::is_same<fp8_dtype, __nv_fp8_e5m2>::value &&
+                std::is_same<fp16_dtype, half>::value) {
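+    // e5m2 shares the 5-bit exponent layout of half, so dequantization is a
+    // byte shuffle that places each fp8 byte in the high byte of a half.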
+    output->x = __byte_perm(0U, q, 0x5140);
+    output->y = __byte_perm(0U, q, 0x7362);
+  } else {
+    constexpr int FP8_EXPONENT = get_exponent_bits<fp8_dtype>();
+    constexpr int FP8_MANTISSA = get_mantissa_bits<fp8_dtype>();
+    constexpr int FP16_EXPONENT = get_exponent_bits<fp16_dtype>();
+
+    constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP8_EXPONENT;
+    // Calculate MASK for extracting mantissa and exponent
+    constexpr int MASK1 = 0x80000000;
+    constexpr int MASK2 = MASK1 >> (FP8_EXPONENT + FP8_MANTISSA);
+    constexpr int MASK3 = MASK2 & 0x7fffffff;
+    constexpr int MASK = MASK3 | (MASK3 >> 16);
+    q = __byte_perm(q, q, 0x1302);
+
+    // Extract and shift FP8 values to FP16 format
+    uint32_t Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+    uint32_t Out2 =
+        ((q << 8) & 0x80008000) | (((q << 8) & MASK) >> RIGHT_SHIFT);
+
+    constexpr int BIAS_OFFSET =
+        (1 << (FP16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
+    // Construct and apply exponent bias
+    if constexpr (std::is_same<fp16_dtype, half>::value) {
+      const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET));
+
+      // Convert to half2 and apply bias
+      *(half2*)&(output->x) =
+          __hmul2(*reinterpret_cast<const half2*>(&Out1), bias_reg);
+      *(half2*)&(output->y) =
+          __hmul2(*reinterpret_cast<const half2*>(&Out2), bias_reg);
+    } else {
+      constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23;
+      const nv_bfloat162 bias_reg =
+          __float2bfloat162_rn(*reinterpret_cast<const float*>(&BIAS));
+      // Convert to bfloat162 and apply bias
+      *(nv_bfloat162*)&(output->x) =
+          __hmul2(*reinterpret_cast<const nv_bfloat162*>(&Out1), bias_reg);
+      *(nv_bfloat162*)&(output->y) =
+          __hmul2(*reinterpret_cast<const nv_bfloat162*>(&Out2), bias_reg);
+    }
+  }
+}
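+
+// Usage sketch: the vec_cast specializations below call this on groups of
+// four packed fp8 bytes, e.g.
+//
+//   uint32_t packed;  // 4x fp8 values, e.g. loaded from a quantized buffer
+//   uint2 out;        // receives 4x half (two half2 lanes)
+//   fast_dequant_f8f16x4<__nv_fp8_e4m3, half>(&packed, &out);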
+
+template <>
+struct vec_cast<nv_bfloat16, __nv_fp8_e4m3> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(nv_bfloat16* dst,
+                                    const __nv_fp8_e4m3* src) {
+    if constexpr (vec_size == 1) {
+      dst[0] = nv_bfloat16(src[0]);
+    } else if constexpr (vec_size == 2) {
+      dst[0] = nv_bfloat16(src[0]);
+      dst[1] = nv_bfloat16(src[1]);
+    } else {
+      static_assert(vec_size % 4 == 0, "vec_size must be a multiple of 4");
+#pragma unroll
+      for (uint32_t i = 0; i < vec_size / 4; ++i) {
+        fast_dequant_f8f16x4<__nv_fp8_e4m3, nv_bfloat16>((uint32_t*)&src[i * 4],
+                                                         (uint2*)&dst[i * 4]);
+      }
+    }
+  }
+};
+
+template <>
+struct vec_cast<nv_bfloat16, __nv_fp8_e5m2> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(nv_bfloat16* dst,
+                                    const __nv_fp8_e5m2* src) {
+    if constexpr (vec_size == 1) {
+      dst[0] = nv_bfloat16(src[0]);
+    } else if constexpr (vec_size == 2) {
+      dst[0] = nv_bfloat16(src[0]);
+      dst[1] = nv_bfloat16(src[1]);
+    } else {
+      static_assert(vec_size % 4 == 0, "vec_size must be a multiple of 4");
+#pragma unroll
+      for (uint32_t i = 0; i < vec_size / 4; ++i) {
+        fast_dequant_f8f16x4<__nv_fp8_e5m2, nv_bfloat16>((uint32_t*)&src[i * 4],
+                                                         (uint2*)&dst[i * 4]);
+      }
+    }
+  }
+};
+
+template <>
+struct vec_cast<__nv_fp8_e4m3, half> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(__nv_fp8_e4m3* dst, const half* src) {
+#ifdef APHRODITE_HARDWARE_FP8_CONVERSION_ENABLED
+    if constexpr (vec_size == 1) {
+      dst[0] = __nv_fp8_e4m3(src[0]);
+    } else {
+  #pragma unroll
+      for (size_t i = 0; i < vec_size / 2; ++i) {
+        uint16_t y;
+        uint32_t x = *(uint32_t*)&src[i * 2];
+        asm volatile("cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;"
+                     : "=h"(y)
+                     : "r"(x));
+        *(uint16_t*)&dst[i * 2] = y;
+      }
+    }
+#else
+  #pragma unroll
+    for (size_t i = 0; i < vec_size; ++i) {
+      dst[i] = __nv_fp8_e4m3(src[i]);
+    }
+#endif  // APHRODITE_HARDWARE_FP8_CONVERSION_ENABLED
+  }
+};
+
+template <>
+struct vec_cast<__nv_fp8_e5m2, half> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(__nv_fp8_e5m2* dst, const half* src) {
+#ifdef APHRODITE_HARDWARE_FP8_CONVERSION_ENABLED
+    if constexpr (vec_size == 1) {
+      dst[0] = __nv_fp8_e5m2(src[0]);
+    } else {
+  #pragma unroll
+      for (size_t i = 0; i < vec_size / 2; ++i) {
+        uint16_t y;
+        uint32_t x = *(uint32_t*)&src[i * 2];
+        asm volatile("cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;"
+                     : "=h"(y)
+                     : "r"(x));
+        *(uint16_t*)&dst[i * 2] = y;
+      }
+    }
+#else
+  #pragma unroll
+    for (size_t i = 0; i < vec_size; ++i) {
+      dst[i] = __nv_fp8_e5m2(src[i]);
+    }
+#endif  // APHRODITE_HARDWARE_FP8_CONVERSION_ENABLED
+  }
+};
+
+template <>
+struct vec_cast<half, __nv_fp8_e4m3> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(half* dst, const __nv_fp8_e4m3* src) {
+#ifdef APHRODITE_HARDWARE_FP8_CONVERSION_ENABLED
+    if constexpr (vec_size == 1) {
+      dst[0] = half(src[0]);
+    } else {
+  #pragma unroll
+      for (size_t i = 0; i < vec_size / 2; ++i) {
+        uint32_t y;
+        uint16_t x = *(uint16_t*)&src[i * 2];
+        asm volatile("cvt.rn.f16x2.e4m3x2 %0, %1;" : "=r"(y) : "h"(x));
+        *(uint32_t*)&dst[i * 2] = y;
+      }
+    }
+#else
+    if constexpr (vec_size == 1) {
+      dst[0] = half(src[0]);
+    } else if constexpr (vec_size == 2) {
+      dst[0] = half(src[0]);
+      dst[1] = half(src[1]);
+    } else {
+      static_assert(vec_size % 4 == 0, "vec_size must be a multiple of 4");
+  #pragma unroll
+      for (uint32_t i = 0; i < vec_size / 4; ++i) {
+        fast_dequant_f8f16x4<__nv_fp8_e4m3, half>((uint32_t*)&src[i * 4],
+                                                  (uint2*)&dst[i * 4]);
+      }
+    }
+#endif  // APHRODITE_HARDWARE_FP8_CONVERSION_ENABLED
+  }
+};
+
+template <>
+struct vec_cast<half, __nv_fp8_e5m2> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(half* dst, const __nv_fp8_e5m2* src) {
+#ifdef APHRODITE_HARDWARE_FP8_CONVERSION_ENABLED
+    if constexpr (vec_size == 1) {
+      dst[0] = half(src[0]);
+    } else {
+  #pragma unroll
+      for (size_t i = 0; i < vec_size / 2; ++i) {
+        uint32_t y;
+        uint16_t x = *(uint16_t*)&src[i * 2];
+        asm volatile("cvt.rn.f16x2.e5m2x2 %0, %1;" : "=r"(y) : "h"(x));
+        *(uint32_t*)&dst[i * 2] = y;
+      }
+    }
+#else
+    if constexpr (vec_size == 1) {
+      dst[0] = half(src[0]);
+    } else if constexpr (vec_size == 2) {
+      dst[0] = half(src[0]);
+      dst[1] = half(src[1]);
+    } else {
+      static_assert(vec_size % 4 == 0, "vec_size must be a multiple of 4");
+  #pragma unroll
+      for (uint32_t i = 0; i < vec_size / 4; ++i) {
+        fast_dequant_f8f16x4<__nv_fp8_e5m2, half>((uint32_t*)&src[i * 4],
+                                                  (uint2*)&dst[i * 4]);
+      }
+    }
+#endif  // APHRODITE_HARDWARE_FP8_CONVERSION_ENABLED
+  }
+};
+
+template <>
+struct vec_cast<float, nv_bfloat16> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(float* dst, const nv_bfloat16* src) {
+    if constexpr (vec_size == 1) {
+      dst[0] = (float)src[0];
+    } else {
+#pragma unroll
+      for (size_t i = 0; i < vec_size / 2; ++i) {
+        ((float2*)dst)[i] = __bfloat1622float2(((nv_bfloat162*)src)[i]);
+      }
+    }
+  }
+};
+
+template <>
+struct vec_cast<nv_bfloat16, float> {
+  template <size_t vec_size>
+  APHRODITE_INLINE static void cast(nv_bfloat16* dst, const float* src) {
+    if constexpr (vec_size == 1) {
+      dst[0] = nv_bfloat16(src[0]);
+    } else {
+#pragma unroll
+      for (size_t i = 0; i < vec_size / 2; ++i) {
+        ((nv_bfloat162*)dst)[i] = __float22bfloat162_rn(((float2*)src)[i]);
+      }
+    }
+  }
+};
+
+template <typename float_t, size_t vec_size>
+struct vec_t {
+  APHRODITE_INLINE float_t& operator[](size_t i);
+  APHRODITE_INLINE const float_t& operator[](size_t i) const;
+  APHRODITE_INLINE void fill(float_t val);
+  APHRODITE_INLINE void load(const float_t* ptr);
+  APHRODITE_INLINE void store(float_t* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, vec_size>& src);
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr);
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const;
+  APHRODITE_INLINE static void memcpy(float_t* dst, const float_t* src);
+  APHRODITE_INLINE float_t* ptr();
+};
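+
+// Note: the primary template above only declares the interface; storage and
+// definitions come from the explicit/partial specializations below, so only
+// the specialized element types (__nv_fp8_e4m3, __nv_fp8_e5m2, half,
+// nv_bfloat16, float) are usable.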
+
+template <typename src_float_t, typename tgt_float_t, size_t vec_size>
+APHRODITE_INLINE void cast_from_impl(vec_t<tgt_float_t, vec_size>& dst,
+                                     const vec_t<src_float_t, vec_size>& src) {
+  vec_cast<tgt_float_t, src_float_t>::cast<vec_size>(
+      dst.ptr(), const_cast<vec_t<src_float_t, vec_size>*>(&src)->ptr());
+}
+
+template <typename src_float_t, typename tgt_float_t, size_t vec_size>
+APHRODITE_INLINE void cast_load_impl(vec_t<tgt_float_t, vec_size>& dst,
+                                     const src_float_t* src_ptr) {
+  if constexpr (std::is_same<src_float_t, tgt_float_t>::value) {
+    dst.load(src_ptr);
+  } else {
+    vec_t<src_float_t, vec_size> tmp;
+    tmp.load(src_ptr);
+    dst.cast_from(tmp);
+  }
+}
+
+template <typename src_float_t, typename tgt_float_t, size_t vec_size>
+APHRODITE_INLINE void cast_store_impl(tgt_float_t* dst_ptr,
+                                      const vec_t<src_float_t, vec_size>& src) {
+  if constexpr (std::is_same<src_float_t, tgt_float_t>::value) {
+    src.store(dst_ptr);
+  } else {
+    vec_t<tgt_float_t, vec_size> tmp;
+    tmp.cast_from(src);
+    tmp.store(dst_ptr);
+  }
+}
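+
+// cast_load/cast_store degenerate to a plain load/store when the source and
+// target element types already match, so the homogeneous case pays no extra
+// conversion cost.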
+
+/******************* vec_t<__nv_fp8_e4m3> *******************/
+
+// __nv_fp8_e4m3 x 1
+template <>
+struct vec_t<__nv_fp8_e4m3, 1> {
+  __nv_fp8_e4m3 data;
+
+  APHRODITE_INLINE __nv_fp8_e4m3& operator[](size_t i) {
+    return ((__nv_fp8_e4m3*)(&data))[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
+    return ((const __nv_fp8_e4m3*)(&data))[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e4m3* ptr() {
+    return reinterpret_cast<__nv_fp8_e4m3*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e4m3 val);
+  APHRODITE_INLINE void load(const __nv_fp8_e4m3* ptr);
+  APHRODITE_INLINE void store(__nv_fp8_e4m3* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 1>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e4m3* dst,
+                                      const __nv_fp8_e4m3* src);
+};
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 1>::fill(__nv_fp8_e4m3 val) {
+  data = val;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 1>::load(const __nv_fp8_e4m3* ptr) {
+  data = *ptr;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 1>::store(__nv_fp8_e4m3* ptr) const {
+  *ptr = data;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 1>::memcpy(
+    __nv_fp8_e4m3* dst, const __nv_fp8_e4m3* src) {
+  *dst = *src;
+}
+
+// __nv_fp8_e4m3 x 2
+template <>
+struct vec_t<__nv_fp8_e4m3, 2> {
+  __nv_fp8x2_e4m3 data;
+
+  APHRODITE_INLINE __nv_fp8_e4m3& operator[](size_t i) {
+    return ((__nv_fp8_e4m3*)(&data))[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
+    return ((const __nv_fp8_e4m3*)(&data))[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e4m3* ptr() {
+    return reinterpret_cast<__nv_fp8_e4m3*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e4m3 val);
+  APHRODITE_INLINE void load(const __nv_fp8_e4m3* ptr);
+  APHRODITE_INLINE void store(__nv_fp8_e4m3* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 2>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e4m3* dst,
+                                      const __nv_fp8_e4m3* src);
+};
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 2>::fill(__nv_fp8_e4m3 val) {
+  data.__x =
+      (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 2>::load(const __nv_fp8_e4m3* ptr) {
+  data = *((__nv_fp8x2_e4m3*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 2>::store(__nv_fp8_e4m3* ptr) const {
+  *((__nv_fp8x2_e4m3*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 2>::memcpy(
+    __nv_fp8_e4m3* dst, const __nv_fp8_e4m3* src) {
+  *((__nv_fp8x2_e4m3*)dst) = *((__nv_fp8x2_e4m3*)src);
+}
+
+// __nv_fp8_e4m3 x 4
+
+template <>
+struct vec_t<__nv_fp8_e4m3, 4> {
+  __nv_fp8x4_e4m3 data;
+
+  APHRODITE_INLINE __nv_fp8_e4m3& operator[](size_t i) {
+    return ((__nv_fp8_e4m3*)(&data))[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
+    return ((const __nv_fp8_e4m3*)(&data))[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e4m3* ptr() {
+    return reinterpret_cast<__nv_fp8_e4m3*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e4m3 val);
+  APHRODITE_INLINE void load(const __nv_fp8_e4m3* ptr);
+  APHRODITE_INLINE void store(__nv_fp8_e4m3* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 4>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e4m3* dst,
+                                      const __nv_fp8_e4m3* src);
+};
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 4>::fill(__nv_fp8_e4m3 val) {
+  data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) |
+             (__nv_fp8x4_storage_t(val.__x) << 16) |
+             (__nv_fp8x4_storage_t(val.__x) << 8) |
+             __nv_fp8x4_storage_t(val.__x);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 4>::load(const __nv_fp8_e4m3* ptr) {
+  data = *((__nv_fp8x4_e4m3*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 4>::store(__nv_fp8_e4m3* ptr) const {
+  *((__nv_fp8x4_e4m3*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 4>::memcpy(
+    __nv_fp8_e4m3* dst, const __nv_fp8_e4m3* src) {
+  *((__nv_fp8x4_e4m3*)dst) = *((__nv_fp8x4_e4m3*)src);
+}
+
+// __nv_fp8_e4m3 x 8
+
+template <>
+struct vec_t<__nv_fp8_e4m3, 8> {
+  uint2 data;
+
+  APHRODITE_INLINE __nv_fp8_e4m3& operator[](size_t i) {
+    return ((__nv_fp8_e4m3*)(&data))[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
+    return ((const __nv_fp8_e4m3*)(&data))[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e4m3* ptr() {
+    return reinterpret_cast<__nv_fp8_e4m3*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e4m3 val);
+  APHRODITE_INLINE void load(const __nv_fp8_e4m3* ptr);
+  APHRODITE_INLINE void store(__nv_fp8_e4m3* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 8>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e4m3* dst,
+                                      const __nv_fp8_e4m3* src);
+};
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 8>::fill(__nv_fp8_e4m3 val) {
+  ((__nv_fp8x4_e4m3*)(&data.x))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) |
+                                       (__nv_fp8x4_storage_t(val.__x) << 16) |
+                                       (__nv_fp8x4_storage_t(val.__x) << 8) |
+                                       __nv_fp8x4_storage_t(val.__x);
+  ((__nv_fp8x4_e4m3*)(&data.y))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) |
+                                       (__nv_fp8x4_storage_t(val.__x) << 16) |
+                                       (__nv_fp8x4_storage_t(val.__x) << 8) |
+                                       __nv_fp8x4_storage_t(val.__x);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 8>::load(const __nv_fp8_e4m3* ptr) {
+  data = *((uint2*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 8>::store(__nv_fp8_e4m3* ptr) const {
+  *((uint2*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e4m3, 8>::memcpy(
+    __nv_fp8_e4m3* dst, const __nv_fp8_e4m3* src) {
+  *((uint2*)dst) = *((uint2*)src);
+}
+
+// __nv_fp8_e4m3 x 16 or more
+template <size_t vec_size>
+struct vec_t<__nv_fp8_e4m3, vec_size> {
+  uint4 data[vec_size / 16];
+
+  APHRODITE_INLINE __nv_fp8_e4m3& operator[](size_t i) {
+    return ((__nv_fp8_e4m3*)data)[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
+    return ((const __nv_fp8_e4m3*)data)[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e4m3* ptr() {
+    return reinterpret_cast<__nv_fp8_e4m3*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e4m3 val) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 16; ++i) {
+      ((__nv_fp8x4_e4m3*)(&(data[i].x)))->__x =
+          (__nv_fp8x4_storage_t(val.__x) << 24) |
+          (__nv_fp8x4_storage_t(val.__x) << 16) |
+          (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
+      ((__nv_fp8x4_e4m3*)(&(data[i].y)))->__x =
+          (__nv_fp8x4_storage_t(val.__x) << 24) |
+          (__nv_fp8x4_storage_t(val.__x) << 16) |
+          (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
+      ((__nv_fp8x4_e4m3*)(&(data[i].z)))->__x =
+          (__nv_fp8x4_storage_t(val.__x) << 24) |
+          (__nv_fp8x4_storage_t(val.__x) << 16) |
+          (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
+      ((__nv_fp8x4_e4m3*)(&(data[i].w)))->__x =
+          (__nv_fp8x4_storage_t(val.__x) << 24) |
+          (__nv_fp8x4_storage_t(val.__x) << 16) |
+          (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
+    }
+  }
+  APHRODITE_INLINE void load(const __nv_fp8_e4m3* ptr) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 16; ++i) {
+      data[i] = ((uint4*)ptr)[i];
+    }
+  }
+  APHRODITE_INLINE void store(__nv_fp8_e4m3* ptr) const {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 16; ++i) {
+      ((uint4*)ptr)[i] = data[i];
+    }
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, vec_size>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e4m3* dst,
+                                      const __nv_fp8_e4m3* src) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 16; ++i) {
+      ((uint4*)dst)[i] = ((uint4*)src)[i];
+    }
+  }
+};
+
+/******************* vec_t<__nv_fp8_e5m2> *******************/
+
+// __nv_fp8_e5m2 x 1
+template <>
+struct vec_t<__nv_fp8_e5m2, 1> {
+  __nv_fp8_e5m2 data;
+
+  APHRODITE_INLINE __nv_fp8_e5m2& operator[](size_t i) {
+    return ((__nv_fp8_e5m2*)(&data))[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
+    return ((const __nv_fp8_e5m2*)(&data))[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e5m2* ptr() {
+    return reinterpret_cast<__nv_fp8_e5m2*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e5m2 val);
+  APHRODITE_INLINE void load(const __nv_fp8_e5m2* ptr);
+  APHRODITE_INLINE void store(__nv_fp8_e5m2* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 1>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e5m2* dst,
+                                      const __nv_fp8_e5m2* src);
+};
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 1>::fill(__nv_fp8_e5m2 val) {
+  data = val;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 1>::load(const __nv_fp8_e5m2* ptr) {
+  data = *ptr;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 1>::store(__nv_fp8_e5m2* ptr) const {
+  *ptr = data;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 1>::memcpy(
+    __nv_fp8_e5m2* dst, const __nv_fp8_e5m2* src) {
+  *dst = *src;
+}
+
+// __nv_fp8_e5m2 x 2
+template <>
+struct vec_t<__nv_fp8_e5m2, 2> {
+  __nv_fp8x2_e5m2 data;
+
+  APHRODITE_INLINE __nv_fp8_e5m2& operator[](size_t i) {
+    return ((__nv_fp8_e5m2*)(&data))[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
+    return ((const __nv_fp8_e5m2*)(&data))[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e5m2* ptr() {
+    return reinterpret_cast<__nv_fp8_e5m2*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e5m2 val);
+  APHRODITE_INLINE void load(const __nv_fp8_e5m2* ptr);
+  APHRODITE_INLINE void store(__nv_fp8_e5m2* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 2>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e5m2* dst,
+                                      const __nv_fp8_e5m2* src);
+};
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 2>::fill(__nv_fp8_e5m2 val) {
+  data.__x =
+      (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 2>::load(const __nv_fp8_e5m2* ptr) {
+  data = *((__nv_fp8x2_e5m2*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 2>::store(__nv_fp8_e5m2* ptr) const {
+  *((__nv_fp8x2_e5m2*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 2>::memcpy(
+    __nv_fp8_e5m2* dst, const __nv_fp8_e5m2* src) {
+  *((__nv_fp8x2_e5m2*)dst) = *((__nv_fp8x2_e5m2*)src);
+}
+
+// __nv_fp8_e5m2 x 4
+
+template <>
+struct vec_t<__nv_fp8_e5m2, 4> {
+  __nv_fp8x4_e5m2 data;
+
+  APHRODITE_INLINE __nv_fp8_e5m2& operator[](size_t i) {
+    return ((__nv_fp8_e5m2*)(&data))[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
+    return ((const __nv_fp8_e5m2*)(&data))[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e5m2* ptr() {
+    return reinterpret_cast<__nv_fp8_e5m2*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e5m2 val);
+  APHRODITE_INLINE void load(const __nv_fp8_e5m2* ptr);
+  APHRODITE_INLINE void store(__nv_fp8_e5m2* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 4>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e5m2* dst,
+                                      const __nv_fp8_e5m2* src);
+};
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 4>::fill(__nv_fp8_e5m2 val) {
+  data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) |
+             (__nv_fp8x4_storage_t(val.__x) << 16) |
+             (__nv_fp8x4_storage_t(val.__x) << 8) |
+             __nv_fp8x4_storage_t(val.__x);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 4>::load(const __nv_fp8_e5m2* ptr) {
+  data = *((__nv_fp8x4_e5m2*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 4>::store(__nv_fp8_e5m2* ptr) const {
+  *((__nv_fp8x4_e5m2*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 4>::memcpy(
+    __nv_fp8_e5m2* dst, const __nv_fp8_e5m2* src) {
+  *((__nv_fp8x4_e5m2*)dst) = *((__nv_fp8x4_e5m2*)src);
+}
+
+// __nv_fp8_e5m2 x 8
+
+template <>
+struct vec_t<__nv_fp8_e5m2, 8> {
+  uint2 data;
+
+  APHRODITE_INLINE __nv_fp8_e5m2& operator[](size_t i) {
+    return ((__nv_fp8_e5m2*)(&data))[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
+    return ((const __nv_fp8_e5m2*)(&data))[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e5m2* ptr() {
+    return reinterpret_cast<__nv_fp8_e5m2*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e5m2 val);
+  APHRODITE_INLINE void load(const __nv_fp8_e5m2* ptr);
+  APHRODITE_INLINE void store(__nv_fp8_e5m2* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 8>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e5m2* dst,
+                                      const __nv_fp8_e5m2* src);
+};
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 8>::fill(__nv_fp8_e5m2 val) {
+  ((__nv_fp8x4_e5m2*)(&data.x))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) |
+                                       (__nv_fp8x4_storage_t(val.__x) << 16) |
+                                       (__nv_fp8x4_storage_t(val.__x) << 8) |
+                                       __nv_fp8x4_storage_t(val.__x);
+  ((__nv_fp8x4_e5m2*)(&data.y))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) |
+                                       (__nv_fp8x4_storage_t(val.__x) << 16) |
+                                       (__nv_fp8x4_storage_t(val.__x) << 8) |
+                                       __nv_fp8x4_storage_t(val.__x);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 8>::load(const __nv_fp8_e5m2* ptr) {
+  data = *((uint2*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 8>::store(__nv_fp8_e5m2* ptr) const {
+  *((uint2*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<__nv_fp8_e5m2, 8>::memcpy(
+    __nv_fp8_e5m2* dst, const __nv_fp8_e5m2* src) {
+  *((uint2*)dst) = *((uint2*)src);
+}
+
+// __nv_fp8_e5m2 x 16 or more
+
+template <size_t vec_size>
+struct vec_t<__nv_fp8_e5m2, vec_size> {
+  uint4 data[vec_size / 16];
+
+  APHRODITE_INLINE __nv_fp8_e5m2& operator[](size_t i) {
+    return ((__nv_fp8_e5m2*)data)[i];
+  }
+  APHRODITE_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
+    return ((const __nv_fp8_e5m2*)data)[i];
+  }
+  APHRODITE_INLINE __nv_fp8_e5m2* ptr() {
+    return reinterpret_cast<__nv_fp8_e5m2*>(&data);
+  }
+  APHRODITE_INLINE void fill(__nv_fp8_e5m2 val) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 16; ++i) {
+      ((__nv_fp8x4_e5m2*)(&(data[i].x)))->__x =
+          (__nv_fp8x4_storage_t(val.__x) << 24) |
+          (__nv_fp8x4_storage_t(val.__x) << 16) |
+          (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
+      ((__nv_fp8x4_e5m2*)(&(data[i].y)))->__x =
+          (__nv_fp8x4_storage_t(val.__x) << 24) |
+          (__nv_fp8x4_storage_t(val.__x) << 16) |
+          (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
+      ((__nv_fp8x4_e5m2*)(&(data[i].z)))->__x =
+          (__nv_fp8x4_storage_t(val.__x) << 24) |
+          (__nv_fp8x4_storage_t(val.__x) << 16) |
+          (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
+      ((__nv_fp8x4_e5m2*)(&(data[i].w)))->__x =
+          (__nv_fp8x4_storage_t(val.__x) << 24) |
+          (__nv_fp8x4_storage_t(val.__x) << 16) |
+          (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
+    }
+  }
+  APHRODITE_INLINE void load(const __nv_fp8_e5m2* ptr) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 16; ++i) {
+      data[i] = ((uint4*)ptr)[i];
+    }
+  }
+  APHRODITE_INLINE void store(__nv_fp8_e5m2* ptr) const {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 16; ++i) {
+      ((uint4*)ptr)[i] = data[i];
+    }
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, vec_size>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(__nv_fp8_e5m2* dst,
+                                      const __nv_fp8_e5m2* src) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 16; ++i) {
+      ((uint4*)dst)[i] = ((uint4*)src)[i];
+    }
+  }
+};
+
+/******************* vec_t<half> *******************/
+
+// half x 1
+template <>
+struct vec_t<half, 1> {
+  half data;
+
+  APHRODITE_INLINE half& operator[](size_t i) { return ((half*)(&data))[i]; }
+  APHRODITE_INLINE const half& operator[](size_t i) const {
+    return ((const half*)(&data))[i];
+  }
+  APHRODITE_INLINE half* ptr() { return reinterpret_cast<half*>(&data); }
+  APHRODITE_INLINE void fill(half val);
+  APHRODITE_INLINE void load(const half* ptr);
+  APHRODITE_INLINE void store(half* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 1>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+
+  APHRODITE_INLINE static void memcpy(half* dst, const half* src);
+};
+
+APHRODITE_INLINE void vec_t<half, 1>::fill(half val) { data = val; }
+
+APHRODITE_INLINE void vec_t<half, 1>::load(const half* ptr) { data = *ptr; }
+
+APHRODITE_INLINE void vec_t<half, 1>::store(half* ptr) const { *ptr = data; }
+
+APHRODITE_INLINE void vec_t<half, 1>::memcpy(half* dst, const half* src) {
+  *dst = *src;
+}
+
+// half x 2
+template <>
+struct vec_t<half, 2> {
+  half2 data;
+
+  APHRODITE_INLINE half& operator[](size_t i) { return ((half*)(&data))[i]; }
+  APHRODITE_INLINE const half& operator[](size_t i) const {
+    return ((const half*)(&data))[i];
+  }
+  APHRODITE_INLINE half* ptr() { return reinterpret_cast<half*>(&data); }
+  APHRODITE_INLINE void fill(half val);
+  APHRODITE_INLINE void load(const half* ptr);
+  APHRODITE_INLINE void store(half* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 2>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+
+  APHRODITE_INLINE static void memcpy(half* dst, const half* src);
+};
+
+APHRODITE_INLINE void vec_t<half, 2>::fill(half val) {
+  data = make_half2(val, val);
+}
+
+APHRODITE_INLINE void vec_t<half, 2>::load(const half* ptr) {
+  data = *((half2*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<half, 2>::store(half* ptr) const {
+  *((half2*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<half, 2>::memcpy(half* dst, const half* src) {
+  *((half2*)dst) = *((half2*)src);
+}
+
+// half x 4
+
+template <>
+struct vec_t<half, 4> {
+  uint2 data;
+
+  APHRODITE_INLINE half& operator[](size_t i) { return ((half*)(&data))[i]; }
+  APHRODITE_INLINE const half& operator[](size_t i) const {
+    return ((const half*)(&data))[i];
+  }
+  APHRODITE_INLINE half* ptr() { return reinterpret_cast<half*>(&data); }
+  APHRODITE_INLINE void fill(half val);
+  APHRODITE_INLINE void load(const half* ptr);
+  APHRODITE_INLINE void store(half* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 4>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(half* dst, const half* src);
+};
+
+APHRODITE_INLINE void vec_t<half, 4>::fill(half val) {
+  *(half2*)(&data.x) = make_half2(val, val);
+  *(half2*)(&data.y) = make_half2(val, val);
+}
+
+APHRODITE_INLINE void vec_t<half, 4>::load(const half* ptr) {
+  data = *((uint2*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<half, 4>::store(half* ptr) const {
+  *((uint2*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<half, 4>::memcpy(half* dst, const half* src) {
+  *((uint2*)dst) = *((uint2*)src);
+}
+
+// half x 8 or more
+
+template <size_t vec_size>
+struct vec_t<half, vec_size> {
+  uint4 data[vec_size / 8];
+  APHRODITE_INLINE half& operator[](size_t i) { return ((half*)data)[i]; }
+  APHRODITE_INLINE const half& operator[](size_t i) const {
+    return ((const half*)data)[i];
+  }
+  APHRODITE_INLINE half* ptr() { return reinterpret_cast<half*>(&data); }
+  APHRODITE_INLINE void fill(half val) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      *(half2*)(&(data[i].x)) = make_half2(val, val);
+      *(half2*)(&(data[i].y)) = make_half2(val, val);
+      *(half2*)(&(data[i].z)) = make_half2(val, val);
+      *(half2*)(&(data[i].w)) = make_half2(val, val);
+    }
+  }
+  APHRODITE_INLINE void load(const half* ptr) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      data[i] = ((uint4*)ptr)[i];
+    }
+  }
+  APHRODITE_INLINE void store(half* ptr) const {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      ((uint4*)ptr)[i] = data[i];
+    }
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, vec_size>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(half* dst, const half* src) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      ((uint4*)dst)[i] = ((uint4*)src)[i];
+    }
+  }
+};
+
+/******************* vec_t<nv_bfloat16> *******************/
+
+// nv_bfloat16 x 1
+template <>
+struct vec_t<nv_bfloat16, 1> {
+  nv_bfloat16 data;
+  APHRODITE_INLINE nv_bfloat16& operator[](size_t i) {
+    return ((nv_bfloat16*)(&data))[i];
+  }
+  APHRODITE_INLINE const nv_bfloat16& operator[](size_t i) const {
+    return ((const nv_bfloat16*)(&data))[i];
+  }
+  APHRODITE_INLINE nv_bfloat16* ptr() {
+    return reinterpret_cast<nv_bfloat16*>(&data);
+  }
+  APHRODITE_INLINE void fill(nv_bfloat16 val);
+  APHRODITE_INLINE void load(const nv_bfloat16* ptr);
+  APHRODITE_INLINE void store(nv_bfloat16* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 1>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(nv_bfloat16* dst, const nv_bfloat16* src);
+};
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 1>::fill(nv_bfloat16 val) {
+  data = val;
+}
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 1>::load(const nv_bfloat16* ptr) {
+  data = *ptr;
+}
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 1>::store(nv_bfloat16* ptr) const {
+  *ptr = data;
+}
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 1>::memcpy(nv_bfloat16* dst,
+                                                    const nv_bfloat16* src) {
+  *dst = *src;
+}
+
+// nv_bfloat16 x 2
+template <>
+struct vec_t<nv_bfloat16, 2> {
+  nv_bfloat162 data;
+
+  APHRODITE_INLINE nv_bfloat16& operator[](size_t i) {
+    return ((nv_bfloat16*)(&data))[i];
+  }
+  APHRODITE_INLINE const nv_bfloat16& operator[](size_t i) const {
+    return ((const nv_bfloat16*)(&data))[i];
+  }
+  APHRODITE_INLINE nv_bfloat16* ptr() {
+    return reinterpret_cast<nv_bfloat16*>(&data);
+  }
+  APHRODITE_INLINE void fill(nv_bfloat16 val);
+  APHRODITE_INLINE void load(const nv_bfloat16* ptr);
+  APHRODITE_INLINE void store(nv_bfloat16* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 2>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(nv_bfloat16* dst, const nv_bfloat16* src);
+};
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 2>::fill(nv_bfloat16 val) {
+  data = make_bfloat162(val, val);
+}
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 2>::load(const nv_bfloat16* ptr) {
+  data = *((nv_bfloat162*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 2>::store(nv_bfloat16* ptr) const {
+  *((nv_bfloat162*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 2>::memcpy(nv_bfloat16* dst,
+                                                    const nv_bfloat16* src) {
+  *((nv_bfloat162*)dst) = *((nv_bfloat162*)src);
+}
+
+// nv_bfloat16 x 4
+
+template <>
+struct vec_t<nv_bfloat16, 4> {
+  uint2 data;
+
+  APHRODITE_INLINE nv_bfloat16& operator[](size_t i) {
+    return ((nv_bfloat16*)(&data))[i];
+  }
+  APHRODITE_INLINE const nv_bfloat16& operator[](size_t i) const {
+    return ((const nv_bfloat16*)(&data))[i];
+  }
+  APHRODITE_INLINE nv_bfloat16* ptr() {
+    return reinterpret_cast<nv_bfloat16*>(&data);
+  }
+  APHRODITE_INLINE void fill(nv_bfloat16 val);
+  APHRODITE_INLINE void load(const nv_bfloat16* ptr);
+  APHRODITE_INLINE void store(nv_bfloat16* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 4>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(nv_bfloat16* dst, const nv_bfloat16* src);
+};
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 4>::fill(nv_bfloat16 val) {
+  *(nv_bfloat162*)(&data.x) = make_bfloat162(val, val);
+  *(nv_bfloat162*)(&data.y) = make_bfloat162(val, val);
+}
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 4>::load(const nv_bfloat16* ptr) {
+  data = *((uint2*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 4>::store(nv_bfloat16* ptr) const {
+  *((uint2*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<nv_bfloat16, 4>::memcpy(nv_bfloat16* dst,
+                                                    const nv_bfloat16* src) {
+  *((uint2*)dst) = *((uint2*)src);
+}
+
+// nv_bfloat16 x 8 or more
+
+template <size_t vec_size>
+struct vec_t<nv_bfloat16, vec_size> {
+  uint4 data[vec_size / 8];
+
+  APHRODITE_INLINE nv_bfloat16& operator[](size_t i) {
+    return ((nv_bfloat16*)data)[i];
+  }
+  APHRODITE_INLINE const nv_bfloat16& operator[](size_t i) const {
+    return ((const nv_bfloat16*)data)[i];
+  }
+  APHRODITE_INLINE nv_bfloat16* ptr() {
+    return reinterpret_cast<nv_bfloat16*>(&data);
+  }
+  APHRODITE_INLINE void fill(nv_bfloat16 val) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      *(nv_bfloat162*)(&(data[i].x)) = make_bfloat162(val, val);
+      *(nv_bfloat162*)(&(data[i].y)) = make_bfloat162(val, val);
+      *(nv_bfloat162*)(&(data[i].z)) = make_bfloat162(val, val);
+      *(nv_bfloat162*)(&(data[i].w)) = make_bfloat162(val, val);
+    }
+  }
+  APHRODITE_INLINE void load(const nv_bfloat16* ptr) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      data[i] = ((uint4*)ptr)[i];
+    }
+  }
+  APHRODITE_INLINE void store(nv_bfloat16* ptr) const {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      ((uint4*)ptr)[i] = data[i];
+    }
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, vec_size>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(nv_bfloat16* dst,
+                                      const nv_bfloat16* src) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      ((uint4*)dst)[i] = ((uint4*)src)[i];
+    }
+  }
+};
+
+/******************* vec_t<float> *******************/
+
+// float x 1
+
+template <>
+struct vec_t<float, 1> {
+  float data;
+
+  APHRODITE_INLINE float& operator[](size_t i) { return ((float*)(&data))[i]; }
+  APHRODITE_INLINE const float& operator[](size_t i) const {
+    return ((const float*)(&data))[i];
+  }
+  APHRODITE_INLINE float* ptr() { return reinterpret_cast<float*>(&data); }
+  APHRODITE_INLINE void fill(float val);
+  APHRODITE_INLINE void load(const float* ptr);
+  APHRODITE_INLINE void store(float* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 1>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(float* dst, const float* src);
+};
+
+APHRODITE_INLINE void vec_t<float, 1>::fill(float val) { data = val; }
+
+APHRODITE_INLINE void vec_t<float, 1>::load(const float* ptr) { data = *ptr; }
+
+APHRODITE_INLINE void vec_t<float, 1>::store(float* ptr) const { *ptr = data; }
+
+APHRODITE_INLINE void vec_t<float, 1>::memcpy(float* dst, const float* src) {
+  *dst = *src;
+}
+
+// float x 2
+
+template <>
+struct vec_t<float, 2> {
+  float2 data;
+
+  APHRODITE_INLINE float& operator[](size_t i) { return ((float*)(&data))[i]; }
+  APHRODITE_INLINE const float& operator[](size_t i) const {
+    return ((const float*)(&data))[i];
+  }
+  APHRODITE_INLINE float* ptr() { return reinterpret_cast<float*>(&data); }
+  APHRODITE_INLINE void fill(float val);
+  APHRODITE_INLINE void load(const float* ptr);
+  APHRODITE_INLINE void store(float* ptr) const;
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, 2>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(float* dst, const float* src);
+};
+
+APHRODITE_INLINE void vec_t<float, 2>::fill(float val) {
+  data = make_float2(val, val);
+}
+
+APHRODITE_INLINE void vec_t<float, 2>::load(const float* ptr) {
+  data = *((float2*)ptr);
+}
+
+APHRODITE_INLINE void vec_t<float, 2>::store(float* ptr) const {
+  *((float2*)ptr) = data;
+}
+
+APHRODITE_INLINE void vec_t<float, 2>::memcpy(float* dst, const float* src) {
+  *((float2*)dst) = *((float2*)src);
+}
+
+// float x 4 or more
+template <size_t vec_size>
+struct vec_t<float, vec_size> {
+  float4 data[vec_size / 4];
+
+  APHRODITE_INLINE float& operator[](size_t i) { return ((float*)(data))[i]; }
+  APHRODITE_INLINE const float& operator[](size_t i) const {
+    return ((const float*)(data))[i];
+  }
+  APHRODITE_INLINE float* ptr() { return reinterpret_cast<float*>(&data); }
+  APHRODITE_INLINE void fill(float val) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 4; ++i) {
+      data[i] = make_float4(val, val, val, val);
+    }
+  }
+  APHRODITE_INLINE void load(const float* ptr) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 4; ++i) {
+      data[i] = ((float4*)ptr)[i];
+    }
+  }
+  APHRODITE_INLINE void store(float* ptr) const {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 4; ++i) {
+      ((float4*)ptr)[i] = data[i];
+    }
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_from(const vec_t<T, vec_size>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  APHRODITE_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+  APHRODITE_INLINE static void memcpy(float* dst, const float* src) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 4; ++i) {
+      ((float4*)dst)[i] = ((float4*)src)[i];
+    }
+  }
+};
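+
+// Minimal kernel-side usage sketch (illustrative only; the kernel name and
+// shapes are assumptions, not part of this header): load half inputs, widen
+// to float in registers via cast_load, scale, and store as float.
+//
+//   __global__ void scale_kernel(const half* in, float* out, float s) {
+//     constexpr size_t VEC = 8;
+//     const size_t base = (blockIdx.x * blockDim.x + threadIdx.x) * VEC;
+//     vec_t<float, VEC> v;
+//     v.cast_load(in + base);  // vec_t<half, 8> load, then vec_cast to float
+//     #pragma unroll
+//     for (size_t i = 0; i < VEC; ++i) v[i] *= s;
+//     v.store(out + base);
+//   }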
+
+}  // namespace aphrodite
+
+#endif  // VEC_DTYPES_CUH_

+ 22 - 0
kernels/torch_bindings.cpp

@@ -207,6 +207,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("fp_eXmY_linear_forward_cuda", torch::kCUDA,
   ops.impl("fp_eXmY_linear_forward_cuda", torch::kCUDA,
            &fp_eXmY_linear_forward_cuda);
            &fp_eXmY_linear_forward_cuda);
 
 
+  // Sampling Kernels
+  ops.def("sampling_from_probs", &sampling_from_probs);
+  ops.impl("sampling_from_probs", torch::kCUDA, &sampling_from_probs);
+  ops.def("top_k_sampling_from_probs", &top_k_sampling_from_probs);
+  ops.impl("top_k_sampling_from_probs", torch::kCUDA,
+           &top_k_sampling_from_probs);
+  ops.def("min_p_sampling_from_probs", &min_p_sampling_from_probs);
+  ops.impl("min_p_sampling_from_probs", torch::kCUDA,
+           &min_p_sampling_from_probs);
+  ops.def("top_p_sampling_from_probs", &top_p_sampling_from_probs);
+  ops.impl("top_p_sampling_from_probs", torch::kCUDA,
+           &top_p_sampling_from_probs);
+  ops.def("top_k_top_p_sampling_from_probs", &top_k_top_p_sampling_from_probs);
+  ops.impl("top_k_top_p_sampling_from_probs", torch::kCUDA,
+           &top_k_top_p_sampling_from_probs);
+  ops.def("top_k_renorm_prob", &top_k_renorm_prob);
+  ops.impl("top_k_renorm_prob", torch::kCUDA, &top_k_renorm_prob);
+  ops.def("top_p_renorm_prob", &top_p_renorm_prob);
+  ops.impl("top_p_renorm_prob", torch::kCUDA, &top_p_renorm_prob);
+  ops.def("top_k_mask_logits", &top_k_mask_logits);
+  ops.impl("top_k_mask_logits", torch::kCUDA, &top_k_mask_logits);
+
 #endif
 
   // Quantized GEMM for GPTQ.

+ 9 - 2
tests/benchmarks/engine/throughput.py

@@ -75,6 +75,7 @@ def run_aphrodite(
     dtype: str,
     max_model_len: Optional[int],
     enforce_eager: bool,
+    max_seq_len_to_capture: int,
     kv_cache_dtype: str,
     quantization_param_path: Optional[str],
     device: str,
@@ -100,6 +101,7 @@ def run_aphrodite(
         max_model_len=max_model_len,
         gpu_memory_utilization=gpu_memory_utilization,
         enforce_eager=enforce_eager,
+        max_seq_len_to_capture=max_seq_len_to_capture,
         kv_cache_dtype=kv_cache_dtype,
         quantization_param_path=quantization_param_path,
         device=device,
@@ -233,8 +235,8 @@ def main(args: argparse.Namespace):
             args.quant_llm_fp_bits,
             args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
             args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device,
+            args.enforce_eager, args.max_seq_len_to_capture,
+            args.kv_cache_dtype, args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.distributed_executor_backend,
             args.gpu_memory_utilization, args.download_dir, args.load_format,
@@ -344,6 +346,11 @@ if __name__ == "__main__":
     parser.add_argument("--enforce-eager",
     parser.add_argument("--enforce-eager",
                         action="store_true",
                         action="store_true",
                         help="enforce eager execution")
                         help="enforce eager execution")
+    parser.add_argument("--max-seq-len-to-capture",
+                        type=int,
+                        default=None,
+                        help="The maximum sequence length to capture for "
+                        "CUDA graphs.")
     parser.add_argument(
         '--kv-cache-dtype',
         type=str,