
feat: quant_llm support (#755)

* feat: add fp6 quant kernels

Co-authored-by: intervitens <intervitens@tutanota.com>

* add implementation

* formatting

* fix: add prefix

* codespell

* compile kernel for e3m3

* add fp4_e2m1 support

* add support for fp2, fp3 and configurable exponent

* bump capability to sm_80

* missed one more directive

* clean up

---------

Co-authored-by: intervitens <intervitens@tutanota.com>
AlpinDale committed 5 months ago
commit 73177656ed
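
Usage sketch for the new options (illustrative only; it assumes the top-level `LLM` entrypoint forwards the EngineArgs fields added in this change, and the model name is a placeholder):

# Hypothetical usage; requires a CUDA build (sm_80+) with the fp6 kernels compiled.
from aphrodite import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",   # placeholder model
    quantization="quant_llm",           # or "fp2".."fp7" to use the default exponent layout
    quant_llm_fp_bits=6,                # total bits per weight (2-7)
    quant_llm_exp_bits=2,               # exponent bits; mantissa = 6 - 2 - 1 = 3
    dtype="float16",                    # the kernels operate on FP16 activations
    enforce_eager=True,                 # the fpX shorthands set these two automatically
)
print(llm.generate(["Hello, world"], SamplingParams(max_tokens=16)))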

+ 1 - 0
CMakeLists.txt

@@ -199,6 +199,7 @@ if(APHRODITE_GPU_LANG STREQUAL "CUDA")
  FetchContent_MakeAvailable(cutlass)

  list(APPEND APHRODITE_EXT_SRC
+    "kernels/quantization/fp6/fp6_linear.cu"
     "kernels/mamba/mamba_ssm/selective_scan_fwd.cu"
     "kernels/mamba/mamba_ssm/selective_scan_fwd.cu"
     "kernels/mamba/causal_conv1d/causal_conv1d.cu"
     "kernels/mamba/causal_conv1d/causal_conv1d.cu"
     "kernels/quantization/aqlm/gemm_kernels.cu"
     "kernels/quantization/aqlm/gemm_kernels.cu"

+ 14 - 0
aphrodite/_custom_ops.py

@@ -465,6 +465,20 @@ def ggml_mul_mat_a8(
    return torch.ops._C.ggml_mul_mat_a8(W, X, quant_type, row)


+# fp6
+def fp_eXmY_linear_forward_cuda(
+    EXPONENT: int,
+    MANTISSA: int,
+    _in_feats: torch.Tensor,
+    _weights: torch.Tensor,
+    _scales: torch.Tensor,
+    splitK: int = 1,
+) -> torch.Tensor:
+    return torch.ops._C.fp_eXmY_linear_forward_cuda(EXPONENT, MANTISSA,
+                                                    _in_feats, _weights,
+                                                    _scales, splitK)
+
+
# mamba
def causal_conv1d_fwd(x: torch.Tensor, weight: torch.Tensor,
                      bias_: Optional[torch.Tensor],
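
A shape-level sketch of calling the new wrapper (assumes a CUDA build with these kernels compiled; per the kernel's checks the output-channel count must be a multiple of 256, the input-channel count a multiple of 64, and the packed weight carries IC // 8 * (1 + E + M) bytes per output channel; the tensors here are dummies, real packed weights come from to_scaled_tc_fpx):

import torch
from aphrodite import _custom_ops as ops

B, IC, OC = 8, 4096, 4096          # IC % 64 == 0, OC % 256 == 0
E, M = 2, 3                        # NBITS = 1 + E + M = 6 (FP6 E2M3)
x = torch.randn(B, IC, dtype=torch.float16, device="cuda")
w_packed = torch.randint(0, 256, (OC, IC // 8 * (1 + E + M)),
                         dtype=torch.uint8, device="cuda")
scales = torch.ones(OC, dtype=torch.float16, device="cuda")

out = ops.fp_eXmY_linear_forward_cuda(E, M, x, w_packed, scales, splitK=1)
print(out.shape)                   # torch.Size([8, 4096])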

+ 75 - 0
aphrodite/common/config.py

@@ -48,6 +48,12 @@ _PP_SUPPORTED_MODELS = [
]

_OPTIMIZED_QUANTS = [
+    "fp2",
+    "fp3",
+    "fp4",
+    "fp5",
+    "fp6",
+    "fp7",
     "fp8",
     "fp8",
     "marlin",
     "marlin",
     "gptq_marlin_24",
     "gptq_marlin_24",
@@ -57,6 +63,7 @@ _OPTIMIZED_QUANTS = [
     "compressed-tensors",
     "compressed-tensors",
     "compressed_tensors",
     "compressed_tensors",
     "experts_int8",
     "experts_int8",
+    "quant_llm",
]


@@ -95,6 +102,8 @@ class ModelConfig:
            weights. If None, we assume the model weights are not quantized.
        deepspeed_fp_bits: Number of bits to use for DeepSpeed FP quantization.
            Supported number of bits are: 4, 6, 8, 12.
+        quant_llm_fp_bits: Number of bits to use for QuantLLM FP quantization.
+            Supported number of bits are: 2, 3, 4, 5, 6, 7.
        quantization_param_path: Path to JSON file containing scaling factors.
            Used to load KV cache scaling factors into the model when KV cache
            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
@@ -142,6 +151,8 @@ class ModelConfig:
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        deepspeed_fp_bits: Optional[int] = None,
+        quant_llm_fp_bits: Optional[int] = None,
+        quant_llm_exp_bits: Optional[int] = None,
        quantization_param_path: Optional[str] = None,
        enforce_eager: Optional[bool] = None,
        max_context_len_to_capture: Optional[int] = None,
@@ -168,6 +179,8 @@ class ModelConfig:
            self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
        self.deepspeed_fp_bits = deepspeed_fp_bits
+        self.quant_llm_fp_bits = quant_llm_fp_bits
+        self.quant_llm_exp_bits = quant_llm_exp_bits
        self.quantization_param_path = quantization_param_path
        self.enforce_eager = enforce_eager
        self.max_context_len_to_capture = max_context_len_to_capture
@@ -316,6 +329,68 @@ class ModelConfig:
                 "quant_method": "deepspeedfp"
                 "quant_method": "deepspeedfp"
             }
             }
 
 
+        VALID_QUANT_LLM_FP_BITS = [2, 3, 4, 5, 6, 7]
+        VALID_QUANT_LLM_EXPONENTS = [1, 2, 3, 4, 5]
+        # The formula is mantissa_bits = fp_bits - exp_bits - 1
+        # The default exp_bits for each fp_bits are as follows:
+        DEFAULT_EXP_BITS = {
+            2: 1,
+            3: 2,
+            4: 2,
+            5: 2,
+            6: 2,
+            7: 3,
+        }
+
+        if self.quantization == "quant_llm":
+            if self.quant_llm_fp_bits is None:
+                raise ValueError(
+                    "quant_llm_fp_bits must be specified when using "
+                    "quant_llm quantization."
+                )
+            if self.quant_llm_fp_bits not in VALID_QUANT_LLM_FP_BITS:
+                raise ValueError(
+                    f"Invalid quant_llm_fp_bits: {self.quant_llm_fp_bits}. "
+                    f"Must be one of {VALID_QUANT_LLM_FP_BITS}."
+                )
+            if self.quant_llm_exp_bits is None:
+                self.quant_llm_exp_bits = DEFAULT_EXP_BITS[
+                    self.quant_llm_fp_bits]
+            else:
+                if self.quant_llm_exp_bits not in VALID_QUANT_LLM_EXPONENTS:
+                    raise ValueError(
+                        f"Invalid exponent bits: {self.quant_llm_exp_bits}. "
+                        f"Must be one of {VALID_QUANT_LLM_EXPONENTS}."
+                    )
+
+            self.hf_config.quantization_config = {
+                "bits": self.quant_llm_fp_bits,
+                "exp_bits": self.quant_llm_exp_bits,
+                "quant_method": "quant_llm"
+            }
+
+        online_quant_methods = ["fp2", "fp3", "fp4", "fp5", "fp6", "fp7"]
+        if self.quantization in online_quant_methods:
+            fp_bits = int(self.quantization[2])
+            if fp_bits not in VALID_QUANT_LLM_FP_BITS:
+                raise ValueError(
+                    f"Invalid quant_llm_fp_bits: {fp_bits}. "
+                    f"Must be one of {VALID_QUANT_LLM_FP_BITS}."
+                )
+            if fp_bits in [2, 3]:
+                logger.warning("FP2 and FP3 quantization methods lead to "
+                               "significant accuracy loss. Use them with "
+                               "caution. Model may be incoherent.")
+            exp_bits = DEFAULT_EXP_BITS[fp_bits]
+            self.hf_config.quantization_config = {
+                "bits": fp_bits,
+                "exp_bits": exp_bits,
+                "quant_method": self.quantization
+            }
+            self.dtype = torch.float16
+            self.enforce_eager = True
+
        if self.quantization is not None:
            if self.quantization not in supported_quantization:
                raise ValueError(
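
A small sketch of how the fp-bit / exponent-bit combinations above resolve (the `format` field is only for display in this sketch; the actual `quantization_config` written by ModelConfig carries `bits`, `exp_bits`, and `quant_method` as in the code):

# Mirrors DEFAULT_EXP_BITS and the mantissa formula used by ModelConfig.
DEFAULT_EXP_BITS = {2: 1, 3: 2, 4: 2, 5: 2, 6: 2, 7: 3}

def quant_llm_layout(fp_bits, exp_bits=None):
    exp = DEFAULT_EXP_BITS[fp_bits] if exp_bits is None else exp_bits
    mantissa = fp_bits - exp - 1           # mantissa_bits = fp_bits - exp_bits - 1
    return {"bits": fp_bits, "exp_bits": exp, "quant_method": "quant_llm",
            "format": f"FP{fp_bits}_E{exp}M{mantissa}"}

for bits in (2, 3, 4, 5, 6, 7):
    print(quant_llm_layout(bits)["format"])
# FP2_E1M0, FP3_E2M0, FP4_E2M1, FP5_E2M2, FP6_E2M3, FP7_E3M3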

+ 18 - 2
aphrodite/engine/args_tools.py

@@ -94,6 +94,8 @@ class EngineArgs:
    quantization_param_path: Optional[str] = None
    preemption_mode: Optional[str] = None
    deepspeed_fp_bits: Optional[int] = None
+    quant_llm_fp_bits: Optional[int] = None
+    quant_llm_exp_bits: Optional[int] = None
    # Cache Options
    kv_cache_dtype: str = "auto"
    block_size: int = 16
@@ -498,8 +500,20 @@ class EngineArgs:
                            type=int,
                            default=None,
                            help="Category: Quantization Options\n"
-                            "Number of floating bits to use for the deepseed "
-                            "quantization. Supported bits are: 4, 6, 8, 12. ")
+                            "Number of floating bits to use for the deepspeed "
+                            "quantization. Supported bits are: 4, 6, 8, 12.")
+        parser.add_argument("--quant-llm-fp-bits",
+                            type=int,
+                            default=None,
+                            help="Category: Quantization Options\n"
+                            "Number of floating bits to use for the quant_llm "
+                            "quantization. Supported bits are: 2 to 7.")
+        parser.add_argument("--quant-llm-exp-bits",
+                            type=int,
+                            default=None,
+                            help="Category: Quantization Options\n"
+                            "Number of exponent bits to use for the quant_llm "
+                            "quantization. Supported bits are: 1 to 5.")
        # Cache Options
        parser.add_argument(
            '--kv-cache-dtype',
@@ -886,6 +900,8 @@ class EngineArgs:
            max_model_len=self.max_model_len,
            quantization=self.quantization,
            deepspeed_fp_bits=self.deepspeed_fp_bits,
+            quant_llm_fp_bits=self.quant_llm_fp_bits,
+            quant_llm_exp_bits=self.quant_llm_exp_bits,
            quantization_param_path=self.quantization_param_path,
            enforce_eager=self.enforce_eager,
            max_context_len_to_capture=self.max_context_len_to_capture,

+ 9 - 0
aphrodite/quantization/__init__.py

@@ -11,6 +11,7 @@ from aphrodite.quantization.deepspeedfp import DeepSpeedFPConfig
from aphrodite.quantization.eetq import EETQConfig
from aphrodite.quantization.experts_int8 import ExpertsInt8Config
from aphrodite.quantization.fbgemm_fp8 import FBGEMMFp8Config
+from aphrodite.quantization.fp6 import QuantLLMFPConfig
from aphrodite.quantization.fp8 import Fp8Config
from aphrodite.quantization.gguf import GGUFConfig
from aphrodite.quantization.gptq import GPTQConfig
@@ -29,6 +30,7 @@ QUANTIZATION_METHODS = {
     "tpu_int8": Int8TpuConfig,
     "tpu_int8": Int8TpuConfig,
     "eetq": EETQConfig,
     "eetq": EETQConfig,
     "fp8": Fp8Config,
     "fp8": Fp8Config,
+    "quant_llm": QuantLLMFPConfig,
     "fbgemm_fp8": FBGEMMFp8Config,
     "fbgemm_fp8": FBGEMMFp8Config,
     "gguf": GGUFConfig,
     "gguf": GGUFConfig,
     # The order of gptq methods is important for config.py iteration over
     # The order of gptq methods is important for config.py iteration over
@@ -44,6 +46,13 @@ QUANTIZATION_METHODS = {
     "bitsandbytes": BitsAndBytesConfig,
     "bitsandbytes": BitsAndBytesConfig,
     "qqq": QQQConfig,
     "qqq": QQQConfig,
     "experts_int8": ExpertsInt8Config,
     "experts_int8": ExpertsInt8Config,
+    # the quant_llm methods
+    "fp2": QuantLLMFPConfig,
+    "fp3": QuantLLMFPConfig,
+    "fp4": QuantLLMFPConfig,
+    "fp5": QuantLLMFPConfig,
+    "fp6": QuantLLMFPConfig,
+    "fp7": QuantLLMFPConfig,
}



+ 198 - 0
aphrodite/quantization/fp6.py

@@ -0,0 +1,198 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+import torch.nn as nn
+from loguru import logger
+
+from aphrodite import _custom_ops as ops
+from aphrodite.distributed import get_tensor_model_parallel_rank
+from aphrodite.modeling.layers.linear import LinearBase, LinearMethodBase
+from aphrodite.modeling.utils import set_weight_attrs
+from aphrodite.quantization.base_config import QuantizationConfig
+from aphrodite.quantization.utils.fp6_utils import (_SPLIT_K_MAP,
+                                                    from_scaled_tc_fpx,
+                                                    to_scaled_tc_fpx)
+
+
+class QuantLLMFPConfig(QuantizationConfig):
+    """Config for QuantLLM FP quantizer. It supports fp2, fp3, fp4,
+    fp5, fp6, fp7.
+    
+    Reference: https://arxiv.org/abs/2401.14112
+    
+    Args: 
+        weight_bits: the target quantization bits, should be one of
+            2, 3, 4, 5, 6, 7.
+    """
+
+    def __init__(
+        self,
+        weight_bits: int = 6,
+        exp_bits: int = 2,
+    ) -> None:
+        self.weight_bits = weight_bits
+        self.exponent_bits = exp_bits
+
+        self.mantissa_bits = weight_bits - self.exponent_bits - 1
+
+        self.valid_types = [torch.float16]
+
+        if self.weight_bits not in [2, 3, 4, 5, 6, 7]:
+            raise ValueError(
+                "Currently, only 2-bit to 7-bit quantization is supported "
+                "for QuantLLM FP quantization, but got "
+                f"{self.weight_bits} bits.")
+
+        if get_tensor_model_parallel_rank() == 0:
+            logger.info(f"Loading model in FP{self.weight_bits}_E"
+                        f"{self.exponent_bits}M{self.mantissa_bits} format.")
+
+    def __repr__(self) -> str:
+        return (f"QuantLLMFPConfig(weight_bits={self.weight_bits}, "
+                f"exponent_bits={self.exponent_bits})")
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "QuantLLMFP"
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "QuantLLMFPConfig":
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        exp_bits = cls.get_from_keys(config, ["exp_bits"])
+        return cls(weight_bits=weight_bits, exp_bits=exp_bits)
+
+    def get_linear_method(self) -> "QuantLLMFPLinearMethod":
+        return QuantLLMFPLinearMethod(self)
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # TODO: figure out the actual minimum compute capability.
+        return 80
+
+    @staticmethod
+    def get_config_filenames() -> List[str]:
+        return [
+            "quant_config.json",
+            "quantize_config.json",
+        ]
+
+    def get_quant_method(
+            self,
+            layer: torch.nn.Module,
+            prefix: str) -> Optional["QuantLLMFPLinearMethod"]:
+        if isinstance(layer, LinearBase):
+            return QuantLLMFPLinearMethod(self)
+        return None
+
+
+class QuantLLMFPLinearMethod(LinearMethodBase):
+    """Linear method for QuantLLMFP quantizer.
+    Args:
+        quant_config: the QuantLLMFP quantization config.
+    """
+
+    def __init__(self, quant_config: QuantLLMFPConfig):
+        self.quant_config = quant_config
+        self.weight = None
+
+    def create_weights(self,
+                       layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: List[int],
+                       input_size: int,
+                       output_size: int,
+                       params_dtype: torch.dtype,
+                       weight_loader=None,
+                       **extra_weight_attrs):
+        del output_size
+        del input_size
+        output_size_per_partition = sum(output_partition_sizes)
+        weight = QuantLLMFPParameter(
+            torch.Size((output_size_per_partition, input_size_per_partition)),
+            params_dtype=params_dtype,
+            quant_config=self.quant_config,
+        )
+        set_weight_attrs(weight, {
+            "input_dim": 1,
+            "output_dim": 0,
+        })
+        layer.register_parameter("weight", weight)
+
+        def quant_weight_loader(param, loaded_weight, *args, **kwargs):
+            # Calls the original weight loader (if any), quantizes the result,
+            # and then loads the quantized parameter.
+            if weight_loader is not None:
+                orig_param_data = param.data
+                param.data = param.quant_llmdequantize()
+                weight_loader(param, loaded_weight, *args, **kwargs)
+                param.data, loaded_weight = orig_param_data, param.data
+            param.quant_llmquantize_(loaded_weight.cuda())
+
+        extra_weight_attrs["weight_loader"] = quant_weight_loader
+        set_weight_attrs(weight, extra_weight_attrs)
+
+    def apply(self,
+              layer,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        weight = layer.weight
+        weights = weight.data
+        scales = weight.scales
+        out_dim, in_dim = weights.shape
+        bsize = x.shape[0]
+        splitK = _SPLIT_K_MAP[(bsize - 1) // 64].get(
+            out_dim, 1) if bsize <= 768 else 1
+        if bias is None:
+            return ops.fp_eXmY_linear_forward_cuda(
+                self.quant_config.exponent_bits,
+                self.quant_config.mantissa_bits,
+                x, weights, scales, splitK)
+        else:
+            return ops.fp_eXmY_linear_forward_cuda(
+                self.quant_config.exponent_bits,
+                self.quant_config.mantissa_bits,
+                x, weights, scales, splitK) + bias
+
+
+
+class QuantLLMFPParameter(nn.Parameter):
+    """
+    QuantLLMFP quantized parameter class that implements fp2 through fp7
+    quantization. Weights are stored in quantized form on
+    GPUs, and can be directly applied to float16 activations.
+    """
+
+    def __new__(cls, orig_shape: torch.Size, params_dtype: torch.dtype,
+                quant_config: QuantLLMFPConfig):
+
+        data = torch.empty(torch.Size(
+            (orig_shape[0],
+             orig_shape[1] * quant_config.weight_bits // 8)),
+            dtype=torch.uint8)
+
+        self = torch.Tensor._make_subclass(cls, data, data.requires_grad)
+        self.scales = torch.empty(orig_shape[0],
+                                  dtype=torch.float16)
+        self.quant_config = quant_config
+        self.orig_shape = orig_shape
+        return self
+
+    def quant_llmquantize_(self, tensor: torch.Tensor):
+        assert tensor.device.type == "cuda" and tensor.dtype != torch.int8
+        data, scales = to_scaled_tc_fpx(
+            tensor.data, self.quant_config.exponent_bits,
+            self.quant_config.mantissa_bits)
+        self.data.copy_(data)
+        self.scales.copy_(scales)
+
+    def quant_llmdequantize(self, output_dtype=None):
+        output_dtype = output_dtype or torch.get_default_dtype()
+        return from_scaled_tc_fpx(self.data, self.quant_config.exponent_bits,
+                                  self.quant_config.mantissa_bits,
+                                  self.scales).to(output_dtype)
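
The Split-K heuristic in `QuantLLMFPLinearMethod.apply` above reads as the small lookup below (a sketch of the same expression: batch sizes are bucketed in 64-token steps; unknown output dimensions and batches above 768 tokens fall back to splitK = 1):

from aphrodite.quantization.utils.fp6_utils import _SPLIT_K_MAP

def pick_split_k(batch_size: int, out_dim: int) -> int:
    # Mirrors: _SPLIT_K_MAP[(bsize - 1) // 64].get(out_dim, 1) if bsize <= 768 else 1
    if batch_size > 768:
        return 1
    return _SPLIT_K_MAP[(batch_size - 1) // 64].get(out_dim, 1)

print(pick_split_k(32, 4096))    # bucket [1, 64]    -> 13
print(pick_split_k(200, 8192))   # bucket [193, 256] -> 5
print(pick_split_k(32, 1234))    # unknown out_dim   -> 1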

+ 585 - 0
aphrodite/quantization/utils/fp6_utils.py

@@ -0,0 +1,585 @@
+# ruff: noqa
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This script was initially developed for sub-byte MX dtypes (FP4 E2M1, FP6 E3M2, and FP6 E2M3).
+# It has been refactored to support any sub-byte FP dtypes. However, some behaviors of MX dtypes remain:
+#   1. No encodings are reserved for special values (+/-inf, NaN).
+#   2. When downcasting from FP32 to FPx,
+#      - Rounding mode is round to nearest, ties to even.
+#      - Values outside the representable range of FPx after rounding are clamped to the maximum FPx
+#      magnitude (sign is preserved).
+from functools import reduce
+from typing import Tuple
+
+import torch
+from torch import Tensor
+
+
+def _n_ones(n: int) -> int:
+    return (1 << n) - 1
+
+
+EBITS_F32, MBITS_F32 = 8, 23
+F32_EXP_BIAS = _n_ones(EBITS_F32 - 1)
+
+# https://github.com/microsoft/DeepSpeed/blob/3a3a6db3332e339cc9fd94efd4982f6d60635a3d/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py
+_SPLIT_K_MAP = [
+    {  # tokens: [1, 64]
+        3072: 18,
+        4096: 13,
+        5120: 10,
+        6144: 9,
+        8192: 6,
+        10240: 5,
+        14336: 7,
+        28672: 7,
+        57344: 7
+    },
+    {  # tokens: [65:128]
+        3072: 9,
+        4096: 6,
+        5120: 5,
+        6144: 9,
+        8192: 3,
+        10240: 5,
+        14336: 7,
+        28672: 7,
+        57344: 6
+    },
+    {  # tokens: [129:192]
+        3072: 6,
+        4096: 4,
+        5120: 7,
+        6144: 3,
+        8192: 2,
+        10240: 5,
+        14336: 5,
+        28672: 5,
+        57344: 4
+    },
+    {  # tokens: [193:256]
+        3072: 9,
+        4096: 3,
+        5120: 5,
+        6144: 2,
+        8192: 5,
+        10240: 4,
+        14336: 8,
+        28672: 6,
+        57344: 4
+    },
+    {  # tokens: [257:320]
+        3072: 7,
+        4096: 5,
+        5120: 2,
+        6144: 5,
+        8192: 4,
+        10240: 1,
+        14336: 3,
+        28672: 3,
+        57344: 4
+    },
+    {  # tokens: [321:384]
+        3072: 3,
+        4096: 2,
+        5120: 5,
+        6144: 3,
+        8192: 1,
+        10240: 8,
+        14336: 3,
+        28672: 4,
+        57344: 3
+    },
+    {  # tokens: [385:448]
+        3072: 5,
+        4096: 7,
+        5120: 3,
+        6144: 5,
+        8192: 7,
+        10240: 3,
+        14336: 1,
+        28672: 1,
+        57344: 3
+    },
+    {  # tokens: [449:512]
+        3072: 2,
+        4096: 5,
+        5120: 4,
+        6144: 1,
+        8192: 5,
+        10240: 2,
+        14336: 6,
+        28672: 4,
+        57344: 1
+    },
+    {  # tokens: [513:576]
+        3072: 2,
+        4096: 3,
+        5120: 1,
+        6144: 1,
+        8192: 3,
+        10240: 3,
+        14336: 3,
+        28672: 1,
+        57344: 1
+    },
+    {  # tokens: [577:640]
+        3072: 5,
+        4096: 4,
+        5120: 1,
+        6144: 4,
+        8192: 2,
+        10240: 1,
+        14336: 1,
+        28672: 1,
+        57344: 1
+    },
+    {  # tokens: [641:704]
+        3072: 3,
+        4096: 1,
+        5120: 2,
+        6144: 2,
+        8192: 1,
+        10240: 2,
+        14336: 1,
+        28672: 1,
+        57344: 1
+    },
+    {  # tokens: [705:768]
+        3072: 3,
+        4096: 1,
+        5120: 3,
+        6144: 2,
+        8192: 1,
+        10240: 1,
+        14336: 1,
+        28672: 1,
+        57344: 1
+    }
+]
+
+
+def _f32_to_fpx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor:
+    """Convert FP32 numbers to sub-byte floating point numbers with the given
+    number of exponent and mantissa bits.
+    Input: torch.Tensor of dtype torch.float
+    Output: torch.Tensor of dtype torch.uint8, where the bit encoding is stored
+    in the least significant bits. e.g.
+      fp4: bits 0-3 empty and bits 4-7 in fp4_e2m1 encoding
+      fp6: bits 0-1 empty and bits 2-7 in fp6_e2m3 or fp6_e3m2 encoding
+    Note: there are no special values (NaN, inf) support in this code. Values
+    outside the representable range of FPx after rounding are clamped to the
+    maximum FPx magnitude (sign is preserved).
+    Code below is an adaptation of https://fburl.com/code/ciwofcg4
+    Background 1: last answer in https://stackoverflow.com/questions/8981913/how-to-perform-round-to-even-with-floating-point-numbers  # noqa: E501
+    Background 2: Computer Organization and Design, RISC-V edition, Chapter 3.5
+    """
+    assert x.dtype == torch.float
+    assert 1 + ebits + mbits <= 8
+
+    # calculate constants
+    exp_bias = _n_ones(ebits - 1)
+    max_int = _n_ones(ebits + mbits)
+    sign_mask = 1 << (ebits + mbits)
+
+    # TODO document this better
+    magic_adder = _n_ones(MBITS_F32 - mbits - 1)
+
+    # all E bits and M bits are 1s
+    max_normal = 2 ** (_n_ones(ebits) - exp_bias) * (_n_ones(mbits + 1) / (2 ** mbits))
+
+    # E bits = 1, M bits = 0
+    min_normal = 2 ** (1 - exp_bias)
+
+    denorm_exp = (
+        # exp bias conversion between formats
+        (F32_EXP_BIAS - exp_bias)
+        # mantissa length difference between formats
+        + (MBITS_F32 - mbits)
+        # add one to encoded exponent for denormalized numbers
+        + 1
+    )
+    denorm_mask_int = denorm_exp << MBITS_F32
+
+    # reinterpret int32 as float32
+    denorm_mask_float = torch.tensor(denorm_mask_int, dtype=torch.int32).view(torch.float32)
+
+    # save the sign
+    # Note that we have torch.uint32, but some ops like cpu bit shifts
+    # do not work on it. So, we stay in int32.
+    x = x.view(torch.int32)
+    sign = x & 0x80000000
+
+    # set everything to positive, will add sign back at the end
+    x = x ^ sign
+
+    # TODO: can the branch floating point comparisons below be done without
+    # converting to float? probably but need to verify
+    x = x.view(torch.float)
+
+    # rewrite saturate/denorm/norm branches without explicit data dependent
+    # control flow, to be more compiler friendly
+    saturate_mask = x >= max_normal
+    denormal_mask = torch.logical_and(torch.logical_not(saturate_mask), x < min_normal)
+    normal_mask = torch.logical_not(torch.logical_or(saturate_mask, denormal_mask))
+
+    #
+    # branch 1: saturate to max val - handled later in the code which combines
+    #   the branches
+    #
+
+    #
+    # branch 2: to conversion to denormal as well as rounding up to normal
+    #
+    denormal_x = x + denorm_mask_float
+    denormal_x = denormal_x.view(torch.int32)
+    denormal_x -= denorm_mask_int
+    denormal_x = denormal_x.to(torch.uint8)
+
+    #
+    # branch 3: stay in normal range, adjust the exponent and round
+    #
+    normal_x = x.view(torch.int32)
+    # resulting mantissa is odd
+    mant_odd = (normal_x >> (MBITS_F32 - mbits)) & 1
+    # update exponent, rounding bias part 1
+    val_to_add = ((exp_bias - F32_EXP_BIAS) << MBITS_F32) + magic_adder
+    normal_x += val_to_add
+    # rounding bias part 2
+    normal_x += mant_odd
+    # take the bits!
+    normal_x = normal_x >> (MBITS_F32 - mbits)
+    normal_x = normal_x.to(torch.uint8)
+
+    #
+    # combine the branches
+    #
+    x = torch.full_like(x, max_int, dtype=torch.uint8)
+    x = torch.where(denormal_mask, denormal_x, x)
+    x = torch.where(normal_mask, normal_x, x)
+
+    # add sign back
+    sign_lp = sign >> (MBITS_F32 + EBITS_F32 - mbits - ebits)
+    sign_lp = sign_lp.to(torch.uint8)
+    # Right shift of a negative signed integer can fill the least significant
+    # bits with either 1s or 0s, depending on the implementation. Since PyTorch
+    # doesn't have an uint32 dtype, we mask out these bits to get just the
+    # f4 sign bit
+    sign_lp = sign_lp & sign_mask
+    x = x | sign_lp
+
+    return x.to(torch.uint8)
+
+
+# TODO(future): check if LUT for everything is faster than bit shifting,
+# especially for fp4 (only 2^4=16 unique values).
+def _fpx_unpacked_to_f32(x: Tensor, ebits: int, mbits: int) -> Tensor:
+    """Convert sub-byte floating point numbers with the given number of exponent
+    and mantissa bits to FP32.
+    Input: torch.Tensor of dtype uint8, where the bit encoding is stored
+    in the least significant bits. e.g.
+      fp4: bits 0-3 empty and bits 4-7 in fp4_e2m1 encoding
+      fp6: bits 0-1 empty and bits 2-7 in fp6_e2m3 or fp6_e3m2 encoding
+    Output: torch.Tensor of dtype fp32 with the dequantized value
+    """
+    assert x.dtype == torch.uint8
+    assert 1 + ebits + mbits <= 8
+
+    sign_mask = 1 << (ebits + mbits)
+    exp_bias = _n_ones(ebits - 1)
+    mantissa_mask = _n_ones(mbits)
+
+    # save the sign
+    sign_lp = x & sign_mask
+
+    # set everything to positive, will add sign back at the end
+    x_pos = x ^ sign_lp
+
+    #
+    # 1. Calculate zero mask
+    #
+    zero_mask = x_pos == 0
+
+    #
+    # 2. Calculate the denormal path mask
+    #
+    denormal_mask = torch.logical_and((x_pos > 0), ((x_pos >> mbits) == 0))
+
+    #
+    # 3. Calculate the normal path
+    #
+
+    # calculate the new exponent and shift it to bits 2:9 of the result
+    exp_biased_lp = x_pos >> mbits
+    exp_biased_f32 = exp_biased_lp - exp_bias + F32_EXP_BIAS
+    exp_biased_f32 = exp_biased_f32.to(torch.int32) << MBITS_F32
+
+    # shift the mantissa to bits 10:32 of the result
+    mantissa_lp_int32 = (x_pos & mantissa_mask).to(torch.int32)
+    mantissa_f32 = mantissa_lp_int32 << (MBITS_F32 - mbits)
+    result = exp_biased_f32 | mantissa_f32
+
+    #
+    # 4. Add the zero and denormal casts to the already casted normal path
+    #
+    result[zero_mask] = 0
+
+    denormal_exp_biased = 1 - exp_bias + F32_EXP_BIAS
+
+    # fast path.
+    # without this, performance for FP4_E2M1 is slower by 2x
+    if mbits == 1:
+        result[denormal_mask] = (denormal_exp_biased - mbits) << MBITS_F32
+
+    else:
+        # iterate over all possible values of mantissa
+        # i=0, j=1
+        # i=1, j=10,11
+        # i=2, j=100,101,110,111
+        # and so on
+        for i in range(mbits):
+            for mantissa_cmp in range(1 << i, 1 << (i+1)):
+                # left shift mantissa until it overflows (create an implicit 1)
+                # subtract exponent by the same amount
+                left_shift = mbits - i
+                mantissa_f32 = (mantissa_cmp - (1 << i)) << (left_shift + MBITS_F32 - mbits)
+                exp_biased_f32 = (denormal_exp_biased - left_shift) << MBITS_F32
+
+                # we can update this in-place since the values won't overlap
+                # torch.compile() may complain unsupported operand type(s) for |: 'SymInt' and 'int'
+                # thus we use + instead of | here
+                mantissa_lp_int32[mantissa_lp_int32 == mantissa_cmp] = exp_biased_f32 + mantissa_f32
+
+        result = torch.where(denormal_mask, mantissa_lp_int32, result)
+
+    # add sign back
+    sign_f32 = sign_lp.to(torch.int32) << (MBITS_F32 - mbits + EBITS_F32 - ebits)
+    result = result | sign_f32
+
+    return result.view(torch.float)
+
+
+def quant_llm_linear(
+    EXPONENT: int,
+    MANTISSA: int,
+    _in_feats: Tensor,
+    _weights: Tensor,
+    _scales: Tensor,
+    splitK: int = 1,
+) -> Tensor:
+    """
+    Quant-LLM linear layer A @ W.T. See https://arxiv.org/abs/2401.14112 for more details.
+    Arguments
+        EXPONENT: number of exponent bits
+        MANTISSA: number of mantissa bits
+        _in_feats: input activations in FP16
+        _weights: packed FPx weights
+        _scales: scale
+        splitK: split K
+    Returns
+        output of linear layer
+    """
+    return torch.ops.torchao.quant_llm_linear.default(EXPONENT, MANTISSA, _in_feats, _weights, _scales, splitK)
+
+
+_ONES_TABLE = [_n_ones(i) for i in range(8)]
+
+
+def _pack(x: Tensor, n_bits: int) -> Tensor:
+    return reduce(torch.bitwise_or, [x[..., i::(8 // n_bits)] << (8 - (i + 1) * n_bits) for i in range(8 // n_bits)])
+
+
+def _unpack(x: Tensor, n_bits: int) -> Tensor:
+    return torch.stack([(x >> (8 - (i + 1) * n_bits)) & ((1 << n_bits) - 1) for i in range(8 // n_bits)], dim=-1).flatten(-2)
+
+
+# https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/utils/weight_prepacking.h#L87-L116
+def _bit_interleave(x: Tensor, n_bits: int, undo: bool = False) -> Tensor:
+    # the original code unpacks/packs the values from/to uint32 while we unpack/pack the values from/to uint8
+    # thus, we need to reverse byte order within a uint32 word.
+    x = x.reshape(-1, 4).flip(1)
+
+    x = _unpack(x, n_bits)
+    x = x.view(-1, 4 * (8 // n_bits))
+
+    if not undo:
+        bit_order = {
+            1: [1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31,
+                0, 4, 8, 12, 16, 20, 24, 28, 2, 6, 10, 14, 18, 22, 26, 30],
+            2: [1, 5, 9, 13, 3, 7, 11, 15, 0, 4, 8, 12, 2, 6, 10, 14],
+            4: [1, 5, 3, 7, 0, 4, 2, 6],
+        }[n_bits]
+
+    else:
+        # this is inverse of the above, obtained by running
+        # [v.index(i) for i in range(len(v))]
+        bit_order = {
+            1: [16, 0, 24, 8, 17, 1, 25, 9, 18, 2, 26, 10, 19, 3, 27, 11,
+                20, 4, 28, 12, 21, 5, 29, 13, 22, 6, 30, 14, 23, 7, 31, 15],
+            2: [8, 0, 12, 4, 9, 1, 13, 5, 10, 2, 14, 6, 11, 3, 15, 7],
+            4: [4, 0, 6, 2, 5, 1, 7, 3],
+        }[n_bits]
+
+    x = x[:, bit_order]
+    x = _pack(x, n_bits)
+
+    # reverse byte order within a uint32 word again.
+    x = x.reshape(-1, 4).flip(1)
+    return x.flatten()
+
+
+# this is a literal adaptation of FP6-LLM ahead-of-time bit-level pre-packing
+# https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/utils/weight_prepacking.h
+def _pack_tc_fpx(tensor: Tensor, nbits: int) -> Tensor:
+    assert tensor.ndim == 2, tensor.dtype == torch.uint8
+    M, N = tensor.shape
+    assert (M % 64 == 0) and (N % 64 == 0)
+
+    # Pass 1 from original code
+    tensor = tensor.view(M // 64, 4, 2, 8, N // 16, 2, 8)
+    tensor = tensor.permute(0, 4, 1, 5, 2, 3, 6)
+    tensor = tensor.reshape(-1, 32, 2)
+    tensor = tensor.permute(1, 0, 2)
+    tensor = tensor.flatten()
+
+    used_bits = 0
+    fragments = []
+
+    for y in [1, 2, 4]:
+        if nbits & y:
+            mask = (1 << y) - 1
+            tensor_ybit = (tensor >> (nbits - used_bits - y)) & mask
+            tensor_ybit = _pack(tensor_ybit, y)
+
+            tensor_ybit = tensor_ybit.view(32, -1, 4).permute(1, 0, 2).flip(2)
+            tensor_ybit = _bit_interleave(tensor_ybit.flatten(), y)
+            fragments.append(tensor_ybit)
+            used_bits += y
+
+    return torch.cat(fragments, dim=0).view(M, -1)
+
+
+# more optimized version of _pack_tc_fpx() for FP6 by merging ops
+def _pack_tc_fp6(tensor: Tensor) -> Tensor:
+    assert tensor.ndim == 2, tensor.dtype == torch.uint8
+    M, N = tensor.shape
+    assert (M % 64 == 0) and (N % 64 == 0)
+
+    tensor = tensor.view(M // 64, 2, 2, 2, 8, N // 16, 2, 8)
+    tensor = tensor.flip(3)
+
+    tensor_2bit = (tensor >> 4) & 0b11
+    tensor_2bit = tensor_2bit.permute(0, 5, 1, 4, 7, 3, 2, 6)
+    tensor_2bit = _pack(tensor_2bit.flatten(), 2)
+
+    tensor_4bit = tensor & 0b1111
+    tensor_4bit = tensor_4bit.permute(0, 5, 1, 2, 4, 7, 3, 6)
+    tensor_4bit = _pack(tensor_4bit.flatten(), 4)
+
+    return torch.cat([tensor_2bit, tensor_4bit], dim=0).view(M, -1)
+
+
+# currently only optimize for TC-FP6 packing
+def pack_tc_fpx(tensor: Tensor, nbits: int) -> Tensor:
+    if nbits == 6:
+        return _pack_tc_fp6(tensor)
+    return _pack_tc_fpx(tensor, nbits)
+
+
+def to_scaled_tc_fpx(tensor: Tensor, ebits: int, mbits: int) -> Tuple[Tensor, Tensor]:
+    # _n_ones() is not compatible with torch.compile() due to << operator
+    # https://github.com/pytorch/pytorch/issues/119152
+    # exp_bias = _n_ones(ebits - 1)
+    # max_normal = 2 ** (_n_ones(ebits) - exp_bias) * (_n_ones(mbits + 1) / (2 ** mbits))
+
+    # workaround: global lookup table
+    exp_bias = _ONES_TABLE[ebits - 1]
+    max_normal = 2 ** (_ONES_TABLE[ebits] - exp_bias) * (_ONES_TABLE[mbits + 1] / (2 ** mbits))
+
+    tensor = tensor.float()
+    scale = tensor.abs().amax(1).clamp(min=1e-12) / max_normal
+    tensor_fpx = _f32_to_fpx_unpacked(tensor / scale.view(-1, 1), ebits, mbits)
+    tensor_tc_fpx = pack_tc_fpx(tensor_fpx, 1 + ebits + mbits)
+    return tensor_tc_fpx, scale.half()
+
+
+# inverse of _pack_tc_fpx()
+def _unpack_tc_fpx(tensor: Tensor, nbits: int) -> Tensor:
+    assert tensor.ndim == 2 and tensor.dtype == torch.uint8
+    M = tensor.shape[0]
+    size = tensor.numel()
+    tensor = tensor.flatten()
+    offset = 0
+    used_bits = 0
+
+    tensor_fpx = None
+
+    for y in [1, 2, 4]:
+        if nbits & y:
+            size_ybit = size // nbits * y
+            tensor_ybit = tensor[offset : offset + size_ybit]
+            offset += size_ybit
+
+            tensor_ybit = _bit_interleave(tensor_ybit, y, undo=True)            # undo Pass 3
+            tensor_ybit = tensor_ybit.view(-1, 32, 4).flip(2).permute(1, 0, 2)  # undo Pass 2
+
+            tensor_ybit = _unpack(tensor_ybit.flatten(), y)
+            tensor_ybit = tensor_ybit << (nbits - used_bits - y)
+            used_bits += y
+
+            if tensor_fpx is None:
+                tensor_fpx = tensor_ybit
+            else:
+                tensor_fpx |= tensor_ybit
+
+    # undo Pass 1
+    tensor_fpx = tensor_fpx.view(32, -1, 2).permute(1, 0, 2)
+    tensor_fpx = tensor_fpx.reshape(M // 64, -1, 4, 2, 2, 8, 8)
+    tensor_fpx = tensor_fpx.permute(0, 2, 4, 5, 1, 3, 6)
+    tensor_fpx = tensor_fpx.reshape(M, -1)
+    return tensor_fpx
+
+
+# more optimized version of _unpack_tc_fpx() for FP6 by merging ops
+# inverse of _unpack_tc_fp6()
+def _unpack_tc_fp6(tensor: Tensor) -> Tensor:
+    assert tensor.ndim == 2 and tensor.dtype == torch.uint8
+    M = tensor.shape[0]
+    N = tensor.shape[1] // 3 * 4
+    assert (M % 64 == 0) and (N % 64 == 0)
+    size_2bit = M * N // 4
+    size_4bit = M * N // 2
+    tensor = tensor.view(-1)
+    assert tensor.numel() == size_2bit + size_4bit
+
+    tensor_2bit, tensor_4bit = tensor.split([size_2bit, size_4bit])
+
+    tensor_2bit = _unpack(tensor_2bit, 2)
+    tensor_2bit = tensor_2bit.view(M // 64, N // 16, 2, 8, 8, 2, 2, 2)
+    tensor_2bit = tensor_2bit.permute(0, 2, 6, 5, 3, 1, 7, 4)
+
+    tensor_4bit = _unpack(tensor_4bit, 4)
+    tensor_4bit = tensor_4bit.view(M // 64, N // 16, 2, 2, 8, 8, 2, 2)
+    tensor_4bit = tensor_4bit.permute(0, 2, 3, 6, 4, 1, 7, 5)
+
+    tensor_fp6 = (tensor_2bit << 4) | tensor_4bit
+    tensor_fp6 = tensor_fp6.flip(3).reshape(M, N)
+    return tensor_fp6
+
+
+def unpack_tc_fpx(tensor: Tensor, nbits: int) -> Tensor:
+    if nbits == 6:
+        return _unpack_tc_fp6(tensor)
+    return _unpack_tc_fpx(tensor, nbits)
+
+
+def from_scaled_tc_fpx(tensor: Tensor, ebits: int, mbits: int, scale=None) -> Tensor:
+    fpx_unpacked = unpack_tc_fpx(tensor, 1 + ebits + mbits)
+    tensor = _fpx_unpacked_to_f32(fpx_unpacked, ebits, mbits)
+    if scale is not None:
+        tensor = tensor * scale.float().view(-1, 1)
+    return tensor
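
Two quick sanity checks for the helpers above (a sketch; these are plain PyTorch ops and should run on CPU, and the tensor-core pre-packing requires both matrix dimensions to be multiples of 64). The first reproduces the `max_normal` formula used by `to_scaled_tc_fpx`, the second round-trips a 64x64 matrix through quantize/dequantize:

import torch
from aphrodite.quantization.utils.fp6_utils import (
    _n_ones, from_scaled_tc_fpx, to_scaled_tc_fpx)

def max_normal(ebits: int, mbits: int) -> float:
    # Largest representable FPx magnitude: all exponent and mantissa bits set.
    exp_bias = _n_ones(ebits - 1)
    return 2 ** (_n_ones(ebits) - exp_bias) * (_n_ones(mbits + 1) / 2 ** mbits)

print(max_normal(2, 3))  # FP6 E2M3 -> 7.5
print(max_normal(3, 2))  # FP6 E3M2 -> 28.0

w = torch.randn(64, 64)
packed, scales = to_scaled_tc_fpx(w, ebits=2, mbits=3)   # uint8 [64, 48], fp16 [64]
w_hat = from_scaled_tc_fpx(packed, ebits=2, mbits=3, scale=scales)
print((w - w_hat).abs().max())   # small, relative to each row's max magnitude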

+ 73 - 0
kernels/quantization/fp6/configs.h

@@ -0,0 +1,73 @@
+//    Copyright 2024 FP6-LLM authors
+//
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// This file is copied from
+// https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/include/configs.h
+
+#ifndef CONFIGS_H
+#define CONFIGS_H
+
+// #define DEBUG_MODE
+#define PIPELINE_LEVEL_GMEM 2
+#define PIPELINE_LEVEL_SMEM 2  // only support 2
+
+/************************ Hardware Parameters ************************/
+#define WARP_SIZE 32
+#define REG_BIT_WIDTH 32
+// mma: M=16 K=16 N=8
+#define MMA_8 8
+#define MMA_16 16
+// for memory access
+#define THREAD_OPT_ACCESS_BIT_WIDTH_128 128  // LDS.128, cp_async.128, ...
+#define BIT_WIDTH_PER_HALF 16                // Half precision: FP16
+
+/******************** Register Allocation For GEMM ********************/
+#define REG_PER_THREAD_C_TENSOR_16_16 8  // 8 for FP32 Accumulation
+/********************** Memory Padding Parameters **********************/
+// Eliminating bank-conflict
+#define PADDING_BYTES_16 16  // Padding 16 bytes each column
+#define PADDING_SHARED_MEM_FOR_B_8 \
+  8  // Padding 8 half  each column, during CopyFromGlobalToShared() for B
+#define PADDING_SHARED_MEM_FOR_C_4 \
+  4  // Padding 4 float each column, during StoreToSharedMemoryFromRegister()
+     // for C
+/************************* WARP Tiling part-1 *************************/
+#define WARP_ROW_MMA_TENSORS 4
+#define WARP_M (WARP_ROW_MMA_TENSORS * MMA_16)  // 64
+#define WARP_K_MMA_TENSORS 4
+#define WARP_K (WARP_K_MMA_TENSORS * MMA_16)  // 64
+template <int BLOCK_ROW_WARPS_, int BLOCK_COL_WARPS_, int WARP_COL_MMA_TENSORS_>
+struct TilingConfig {
+  // Depending on "n" dimension of the GEMM
+  static constexpr int BLOCK_ROW_WARPS = BLOCK_ROW_WARPS_;
+  static constexpr int BLOCK_COL_WARPS = BLOCK_COL_WARPS_;
+  static constexpr int WARP_COL_MMA_TENSORS = WARP_COL_MMA_TENSORS_;
+  /************************* WARP Tiling part-2 *************************/
+  static constexpr int WARP_N = WARP_COL_MMA_TENSORS * MMA_8;
+  /*************************Thread Block Tiling *************************/
+  static constexpr int TILE_M = WARP_M * BLOCK_ROW_WARPS;
+  static constexpr int TILE_N = MMA_8 * WARP_COL_MMA_TENSORS * BLOCK_COL_WARPS;
+  static constexpr int TILE_K = WARP_K;
+  /********************** #Thread per Thread Block **********************/
+  static constexpr int BLOCK_WARPS = BLOCK_ROW_WARPS * BLOCK_COL_WARPS;
+  static constexpr int BLOCK_THREADS = BLOCK_WARPS * WARP_SIZE;
+  /******************************* Others *******************************/
+  static constexpr int SMEM_SIZE_B_TILE =
+      TILE_N * (TILE_K + PADDING_BYTES_16) * 2 *
+      PIPELINE_LEVEL_GMEM;  // sizeof(half)=2, doubleBuffer=2
+  static constexpr int SMEM_SIZE_C_TILE =
+      TILE_N * (TILE_M + PADDING_BYTES_16) * 4;  // sizeof(float)=4
+};
+
+#endif  // CONFIGS_H
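
For orientation, the tile shapes produced by the `TilingConfig` instantiations used by the launcher in fp6_linear.cu work out as below (a Python sketch of the constexpr arithmetic above, using the macro values WARP_M = WARP_K = 64, MMA_8 = 8, WARP_SIZE = 32):

WARP_M = WARP_K = 64
MMA_8, WARP_SIZE = 8, 32

def tiling(block_row_warps, block_col_warps, warp_col_mma_tensors):
    tile_m = WARP_M * block_row_warps
    tile_n = MMA_8 * warp_col_mma_tensors * block_col_warps
    tile_k = WARP_K
    threads = block_row_warps * block_col_warps * WARP_SIZE
    return tile_m, tile_n, tile_k, threads

for cfg in [(4, 1, 1), (4, 1, 2), (4, 1, 4), (4, 1, 8)]:
    print(cfg, tiling(*cfg))
# TilingConfig<4, 1, 8> gives a 256 x 64 x 64 tile with 128 threads per block,
# matching the M_Global % 256 == 0 and K_Global % 64 == 0 asserts in the launcher.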

+ 332 - 0
kernels/quantization/fp6/fp6_linear.cu

@@ -0,0 +1,332 @@
+//    Copyright 2024 FP6-LLM authors
+//
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// This file is adapted from
+// https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/fp6_linear.cu
+
+#include "kernel_matmul.cuh"
+#include "kernel_reduction.cuh"
+
+#include <stdio.h>
+#include <assert.h>
+
+namespace aphrodite {
+
+template <typename TilingConfig, typename OutputDataType, int EXPONENT,
+          int MANTISSA>
+static void Kernel_Ex(cudaStream_t stream, const uint4* Weight,
+                      const half* Scales, const half* B, OutputDataType* C,
+                      const size_t M_Global, const size_t N_Global,
+                      const size_t K_Global, int Split_K) {
+#ifdef DEBUG_MODE
+  printf("\n");
+  printf("Launcher.cu->Kernel_Ex():\n");
+  printf("M: %d, N: %d, K: %d, SplitK: %d\n", M_Global, N_Global, K_Global,
+         Split_K);
+  printf("TILE_M: %d, TILE_K: %d, TILE_N: %d\n", TilingConfig::TILE_M,
+         TilingConfig::TILE_K, TilingConfig::TILE_N);
+#endif
+  static size_t SHMEM_SZ =
+      max(TilingConfig::SMEM_SIZE_B_TILE + SMEM_SIZE_PER_TB_A_TILE,
+          TilingConfig::SMEM_SIZE_C_TILE);
+  cudaFuncSetAttribute(
+      QUANT_GEMM_Kernel<TilingConfig, OutputDataType, EXPONENT, MANTISSA>,
+      cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ);
+  size_t dimN = (N_Global - 1) / TilingConfig::TILE_N + 1;
+  size_t dimM = M_Global * Split_K / TilingConfig::TILE_M;
+  dim3 GridDim(dimN, dimM, 1);
+  dim3 BlockDim(WARP_SIZE * TilingConfig::BLOCK_WARPS, 1, 1);
+//
+#ifdef DEBUG_MODE
+  printf(
+      "GridDim.x: %d, GridDim.y: %d, GridDim.z: %d, BlockDim.x: %d, "
+      "BlockDim.y: %d, BlockDim.z: %d SHMEM_SZ: %d\n",
+      GridDim.x, GridDim.y, GridDim.z, BlockDim.x, BlockDim.y, BlockDim.z,
+      SHMEM_SZ);
+  printf("\n");
+#endif
+  QUANT_GEMM_Kernel<TilingConfig, OutputDataType, EXPONENT, MANTISSA>
+      <<<GridDim, BlockDim, SHMEM_SZ, stream>>>(Weight, Scales, B, C, M_Global,
+                                                N_Global, K_Global, Split_K);
+}
+
+template <int EXPONENT, int MANTISSA>
+cudaError_t fpx_linear_kernel(
+    cudaStream_t stream, const uint4* Weight, const half* Scales, const half* B,
+    half* C, const size_t M_Global, const size_t N_Global,
+    const size_t K_Global,
+    float* Reduction_Workspace,  // Reduction_Workspace_Size = Split_K *
+                                 // M_Global * N_Global * sizeof(fp32)
+    int Split_K) {
+  assert(M_Global % 256 == 0);
+  assert(K_Global % 64 == 0);
+  assert(N_Global > 0);
+
+  // Work around to support more N shapes:
+  size_t N_PowerOf2;
+  if (N_Global > 0 && N_Global <= 8) N_PowerOf2 = 8;
+  if (N_Global > 8 && N_Global <= 16) N_PowerOf2 = 16;
+  if (N_Global > 16 && N_Global <= 32) N_PowerOf2 = 32;
+  if (N_Global > 32 && N_Global <= 64) N_PowerOf2 = 64;
+  if (N_Global > 64 && N_Global <= 128) N_PowerOf2 = 128;
+  if (N_Global > 128) N_PowerOf2 = ((N_Global - 1) / 128 + 1) * 128;
+
+  if (Split_K == 1) {
+    switch (N_PowerOf2) {
+      case 8:
+        Kernel_Ex<TilingConfig<4, 1, 1>, half, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, C, M_Global, N_Global, K_Global,
+            Split_K);
+        break;
+      case 16:
+        Kernel_Ex<TilingConfig<4, 1, 2>, half, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, C, M_Global, N_Global, K_Global,
+            Split_K);
+        break;
+      case 32:
+        Kernel_Ex<TilingConfig<4, 1, 4>, half, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, C, M_Global, N_Global, K_Global,
+            Split_K);
+        break;
+      case 64:
+        Kernel_Ex<TilingConfig<4, 1, 8>, half, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, C, M_Global, N_Global, K_Global,
+            Split_K);
+        break;
+      case 128:
+        Kernel_Ex<TilingConfig<4, 1, 8>, half, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, C, M_Global, N_Global, K_Global,
+            Split_K);
+        break;
+      default:
+        if (N_PowerOf2 % 128 != 0) {
+          printf("FP6LLM_API Error: Unsupported N dimension %zu!\n", N_PowerOf2);
+          return cudaErrorUnknown;
+        }
+        Kernel_Ex<TilingConfig<4, 1, 8>, half, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, C, M_Global, N_Global, K_Global,
+            Split_K);
+        break;
+    }
+  } else {
+    switch (N_PowerOf2) {
+      case 8:
+        Kernel_Ex<TilingConfig<4, 1, 1>, float, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global,
+            K_Global, Split_K);
+        break;
+      case 16:
+        Kernel_Ex<TilingConfig<4, 1, 2>, float, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global,
+            K_Global, Split_K);
+        break;
+      case 32:
+        Kernel_Ex<TilingConfig<4, 1, 4>, float, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global,
+            K_Global, Split_K);
+        break;
+      case 64:
+        Kernel_Ex<TilingConfig<4, 1, 8>, float, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global,
+            K_Global, Split_K);
+        break;
+      case 128:
+        Kernel_Ex<TilingConfig<4, 1, 8>, float, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global,
+            K_Global, Split_K);
+        break;
+      default:
+        if (N_PowerOf2 % 128 != 0) {
+          printf("FP6LLM_API Error: Unsupported N dimension %zu!\n", N_PowerOf2);
+          return cudaErrorUnknown;
+        }
+        Kernel_Ex<TilingConfig<4, 1, 8>, float, EXPONENT, MANTISSA>(
+            stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global,
+            K_Global, Split_K);
+        break;
+    }
+    // Reduction for SplitK
+    dim3 GridDim((M_Global * N_Global) / REDUCTION_ELEMENT_PER_THREADBLOCK, 1,
+                 1);
+    dim3 BlockDim(WARP_SIZE, 1, 1);
+    SplitK_Reduction<<<GridDim, BlockDim, 0, stream>>>(
+        C, Reduction_Workspace, M_Global, N_Global, Split_K);
+  }
+  return cudaGetLastError();
+}
+}  // namespace aphrodite
+
+#include <torch/all.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/library.h>
+
+// MODIFICATION NOTE: dtype of _weights is changed to uint8
+/*
+Computes FPx-FP16 GEMM (PyTorch interface).
+
+[Mathematical Formula]
+Standard definition of linear layer:  Out = In * trans(W), where In, Out, and
+W are stored in row-major. After equivalent transformation:
+trans(Out) = W * trans(In). Note that we do not perform "transpose" during
+runtime; we instead interpret In/Out as column-major matrices when calling
+our CUDA kernel.
+
+[Inputs]
+  _in_feats:  tensor of shape [B, IC];                // half
+  _weights:   int tensor of shape [OC, IC // 8 * x];  // x UINT8 words contain
+                                                      // 8 FPx weights each.
+  _scales:    tensor of shape [OC];                   // half
+  splitK:     splitting the MatMul problem along the K dimension for higher
+              GPU utilization, default 1.
+[Outputs]
+  _out_feats: tensor of shape [B, OC];                // half
+*/
+torch::Tensor fp_eXmY_linear_forward_cuda(int64_t EXPONENT, int64_t MANTISSA,
+                                          torch::Tensor _in_feats,
+                                          torch::Tensor _weights,
+                                          torch::Tensor _scales,
+                                          int64_t splitK = 1) {
+  const int64_t NBITS = 1 + EXPONENT + MANTISSA;
+  int num_in_feats = _in_feats.size(0);
+  int num_in_channels = _in_feats.size(1);
+  int num_out_channels = _weights.size(0);
+  TORCH_CHECK(num_in_channels % 64 == 0,
+              "Expected in_features to be a multiple of 64, but received ",
+              num_in_channels);
+  TORCH_CHECK((num_in_channels / 8 * NBITS) ==
+              _weights.size(1));  // Making sure the K dimension is matched.
+  //
+  int M = num_out_channels;
+  int K = num_in_channels;
+  int N = num_in_feats;
+  // Input Tensors
+  auto weight = reinterpret_cast<const uint4*>(
+      _weights.data_ptr<uint8_t>());  // weights is [OC, IC] but in FP6.
+  auto in_feats = reinterpret_cast<const half*>(_in_feats.data_ptr<at::Half>());
+  auto scales = reinterpret_cast<const half*>(_scales.data_ptr<at::Half>());
+  // Output Tensors
+  auto options = torch::TensorOptions()
+                     .dtype(_in_feats.dtype())
+                     .device(_in_feats.device());
+  at::Tensor _out_feats =
+      torch::empty({num_in_feats, num_out_channels}, options);
+  auto out_feats = reinterpret_cast<half*>(_out_feats.data_ptr<at::Half>());
+
+  options =
+      torch::TensorOptions().dtype(torch::kFloat32).device(_in_feats.device());
+  at::Tensor _workspace =
+      torch::empty({splitK, num_in_feats, num_out_channels}, options);
+  auto Reduction_Workspace = reinterpret_cast<float*>(
+      _workspace.data_ptr<float>());  // Reduction_Workspace_Size = Split_K *
+                                      // M_Global * N_Global * sizeof(fp32)
+
+  // MODIFICATION NOTE: use at::cuda::getCurrentCUDAStream() instead of default
+  // stream (0) this fixes problem with CUDA graphs when used with
+  // torch.compile()
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  /*
+   The heuristic is weight_bit - exponent_bit - 1 = mantissa_bit
+   */
+
+  // FP2
+  if (EXPONENT == 1 && MANTISSA == 0)
+    aphrodite::fpx_linear_kernel<1, 0>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+
+  // FP3
+  else if (EXPONENT == 1 && MANTISSA == 1)
+    aphrodite::fpx_linear_kernel<1, 1>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 2 && MANTISSA == 0)
+    aphrodite::fpx_linear_kernel<2, 0>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+
+  // FP4
+  else if (EXPONENT == 1 && MANTISSA == 2)
+    aphrodite::fpx_linear_kernel<1, 2>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 3 && MANTISSA == 0)
+    aphrodite::fpx_linear_kernel<3, 0>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 2 && MANTISSA == 1)
+    aphrodite::fpx_linear_kernel<2, 1>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  // FP5
+  else if (EXPONENT == 1 && MANTISSA == 3)
+    aphrodite::fpx_linear_kernel<1, 3>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 2 && MANTISSA == 2)
+    aphrodite::fpx_linear_kernel<2, 2>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 3 && MANTISSA == 1)
+    aphrodite::fpx_linear_kernel<3, 1>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 4 && MANTISSA == 0)
+    aphrodite::fpx_linear_kernel<4, 0>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+
+  // FP6
+  else if (EXPONENT == 1 && MANTISSA == 4)
+    aphrodite::fpx_linear_kernel<1, 4>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 2 && MANTISSA == 3)
+    aphrodite::fpx_linear_kernel<2, 3>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 3 && MANTISSA == 2)
+    aphrodite::fpx_linear_kernel<3, 2>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 4 && MANTISSA == 1)
+    aphrodite::fpx_linear_kernel<4, 1>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 5 && MANTISSA == 0)
+    aphrodite::fpx_linear_kernel<5, 0>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  // FP7
+  else if (EXPONENT == 1 && MANTISSA == 5)
+    aphrodite::fpx_linear_kernel<1, 5>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 2 && MANTISSA == 4)
+    aphrodite::fpx_linear_kernel<2, 4>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 3 && MANTISSA == 3)
+    aphrodite::fpx_linear_kernel<3, 3>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 4 && MANTISSA == 2)
+    aphrodite::fpx_linear_kernel<4, 2>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+  else if (EXPONENT == 5 && MANTISSA == 1)
+    aphrodite::fpx_linear_kernel<5, 1>(stream, weight, scales, in_feats,
+                                       out_feats, M, N, K, Reduction_Workspace,
+                                       splitK);
+
+  else
+    TORCH_CHECK(false, "FP", NBITS, " E", EXPONENT, "M", MANTISSA,
+                " is not supported.");
+
+  return _out_feats;
+}
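
Note on the dispatch above: every supported format satisfies NBITS = 1 + EXPONENT + MANTISSA (one sign bit), which is what the TORCH_CHECK message reports for unsupported pairs. A minimal Python sketch of the supported pairs (illustrative only; the names below are hypothetical and not part of the kernel API):

# Hypothetical reference table of the (EXPONENT, MANTISSA) pairs instantiated
# by the dispatch in fp6_linear.cu; NBITS counts the implicit sign bit.
SUPPORTED_EXMY = [
    (1, 0),                                  # FP2
    (1, 1), (2, 0),                          # FP3
    (1, 2), (2, 1), (3, 0),                  # FP4
    (1, 3), (2, 2), (3, 1), (4, 0),          # FP5
    (1, 4), (2, 3), (3, 2), (4, 1), (5, 0),  # FP6
    (1, 5), (2, 4), (3, 3), (4, 2), (5, 1),  # FP7
]

def nbits(exponent: int, mantissa: int) -> int:
    # One sign bit plus the exponent and mantissa fields.
    return 1 + exponent + mantissa

assert all(2 <= nbits(e, m) <= 7 for e, m in SUPPORTED_EXMY)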

+ 354 - 0
kernels/quantization/fp6/kernel_matmul.cuh

@@ -0,0 +1,354 @@
+//    Copyright 2024 FP6-LLM authors
+//
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// This file is modified from
+// https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/include/kernel_matmul.cuh
+
+#include "configs.h"
+#include "utils_gmem.cuh"
+#include "utils_core.cuh"
+
+/************************** Bitwidth of Weight Segments
+ * ************************/
+#define BIT_WIDTH_1 1
+#define BIT_WIDTH_2 2
+#define BIT_WIDTH_4 4
+/*************************** 64*64 Weights of Weight Matrix
+ * *********************/
+#define WEIGHT_PER_WARP (WARP_M * WARP_K)  // 64*64 = 4096
+#define SMEM_SIZE_PER_WARP_1BIT    \
+  (WEIGHT_PER_WARP * BIT_WIDTH_1 / \
+   8)  // 512 Bytes,  doubleBuffer not taken into consideration
+#define SMEM_SIZE_PER_WARP_2BIT    \
+  (WEIGHT_PER_WARP * BIT_WIDTH_2 / \
+   8)  // 1024 Bytes, doubleBuffer not taken into consideration
+#define SMEM_SIZE_PER_WARP_4BIT    \
+  (WEIGHT_PER_WARP * BIT_WIDTH_4 / \
+   8)  // 2048 Bytes, doubleBuffer not taken into consideration
+#define SMEM_SIZE_PER_TB_1BIT                            \
+  (SMEM_SIZE_PER_WARP_1BIT * TilingConfig::BLOCK_WARPS * \
+   PIPELINE_LEVEL_GMEM)  // #WARP=4; Triple-Buffer for 3-level pipeline for A
+                         // = 6 KB;  double buffer for 2-level pipeline A= 4
+                         // KB.
+#define SMEM_SIZE_PER_TB_2BIT                            \
+  (SMEM_SIZE_PER_WARP_2BIT * TilingConfig::BLOCK_WARPS * \
+   PIPELINE_LEVEL_GMEM)  // #WARP=4; Triple-Buffer for 3-level pipeline for A
+                         // = 12 KB; double buffer for 2-level pipeline A= 8
+                         // KB.
+#define SMEM_SIZE_PER_TB_4BIT                            \
+  (SMEM_SIZE_PER_WARP_4BIT * TilingConfig::BLOCK_WARPS * \
+   PIPELINE_LEVEL_GMEM)  // #WARP=4; Triple-Buffer for 3-level pipeline for A
+                         // = 24 KB; double buffer for 2-level pipeline A= 16
+                         // KB.
+#define SMEM_SIZE_PER_TB_A_TILE                    \
+  (SMEM_SIZE_PER_TB_1BIT + SMEM_SIZE_PER_TB_2BIT + \
+   SMEM_SIZE_PER_TB_4BIT)  // used in fp6_linear.cu, Kernel_Ex().
+/******************** Global Memory Layout For QUANTIZED DATA
+ * *******************/
+#define NUM_INT4_PER_WARP_1BIT (WEIGHT_PER_WARP * BIT_WIDTH_1 / 128)  // 32
+#define NUM_INT4_PER_WARP_2BIT (WEIGHT_PER_WARP * BIT_WIDTH_2 / 128)  // 64
+#define NUM_INT4_PER_WARP_4BIT (WEIGHT_PER_WARP * BIT_WIDTH_4 / 128)  // 128
+
+/*
+ * C = A*B
+ * A: row major with ahead-of-time layout transformation, FP6
+ * B: col major, FP16
+ * C: col major, FP16
+ */
+template <typename TilingConfig, typename OutputDataType, int EXPONENT,
+          int MANTISSA>
+__global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
+                                  const half* B, OutputDataType* C,
+                                  const size_t M_Global, const size_t N_Global,
+                                  const size_t K_Global, int Split_K) {
+#ifdef DEBUG_MODE
+  assert(K_Global % TilingConfig::TILE_K == 0);
+  assert(M_Global % TilingConfig::TILE_M == 0);
+  assert(gridDim.y == Split_K * (M_Global / TilingConfig::TILE_M));
+#endif
+  // 1+2+4 weight split
+  constexpr int BIT_WIDTH = 1 + EXPONENT + MANTISSA;
+  constexpr int USE_SEG_1BIT = BIT_WIDTH & 1;
+  constexpr int USE_SEG_2BIT = BIT_WIDTH & 2;
+  constexpr int USE_SEG_4BIT = BIT_WIDTH & 4;
+  const uint4* Weight_1bit = Weight;
+  const uint4* Weight_2bit =
+      Weight_1bit +
+      (USE_SEG_1BIT ? M_Global * K_Global * BIT_WIDTH_1 / 128 : 0);
+  const uint4* Weight_4bit =
+      Weight_2bit +
+      (USE_SEG_2BIT ? M_Global * K_Global * BIT_WIDTH_2 / 128 : 0);
+  // Dynamic shared memory for FP16 A tiles, 128 Bytes aligned
+  extern __shared__ __align__(128) half smem[];
+  half(*smem_array)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] =
+      reinterpret_cast<half(*)[WARP_K + PADDING_SHARED_MEM_FOR_B_8]>(
+          smem + SMEM_SIZE_PER_TB_A_TILE /
+                     2);  // Dynamic shared memory for FP16 B tiles
+  __shared__ half
+      QuantScales[64 *
+                  TilingConfig::BLOCK_WARPS];  // static shared memory for
+                                               // quantization scales, 64 row
+                                               // per warp * 4 warps = 512 Bytes
+  // Thread Block Mapping, considering SplitK
+  const size_t BatchID = blockIdx.y / (M_Global / TilingConfig::TILE_M);
+  const size_t x =
+      blockIdx.x;  // Output Block ID: (BlockID_Row = y; BlockID_Col = x )
+  const size_t y =
+      blockIdx.y %
+      (M_Global / TilingConfig::TILE_M);  // Output Block ID: (BlockID_Row = y;
+                                          // BlockID_Col = x )
+  const size_t Tile_Start_M = y * TilingConfig::TILE_M;
+  const size_t Tile_Start_N = x * TilingConfig::TILE_N;
+  const size_t NumColumnToCopy =
+      (N_Global - Tile_Start_N) < TilingConfig::TILE_N
+          ? (N_Global - Tile_Start_N)
+          : TilingConfig::TILE_N;
+  const size_t NumBlock_K = K_Global / TilingConfig::TILE_K;
+  const size_t AverageNumBlock_K = NumBlock_K / Split_K;
+  const size_t ExtraNumBlock_K = NumBlock_K - AverageNumBlock_K * Split_K;
+  size_t NumIter = AverageNumBlock_K;
+  size_t StartBlockID_K = AverageNumBlock_K * BatchID;
+  if (BatchID < ExtraNumBlock_K) {
+    NumIter++;
+    StartBlockID_K += BatchID;
+  } else
+    StartBlockID_K += ExtraNumBlock_K;
+  // Warp ID.
+  const int warpId = threadIdx.x / WARP_SIZE;
+  int WARP_i = warpId / TilingConfig::BLOCK_COL_WARPS;  // WARP_i: row number;
+                                                        // WARP_j: column number
+  // int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS;
+  //  Global Memory Address for Matrix A (Weight)
+  //  /////////////////////////////////////////////////////////////////////////
+  //  StartPTR for each ThreadBlock(TB)
+  const uint4* TB_StartGPTR_A_1BIT =
+      Weight_1bit +
+      (y * TilingConfig::BLOCK_ROW_WARPS) * NumBlock_K * NUM_INT4_PER_WARP_1BIT;
+  const uint4* TB_StartGPTR_A_2BIT =
+      Weight_2bit +
+      (y * TilingConfig::BLOCK_ROW_WARPS) * NumBlock_K * NUM_INT4_PER_WARP_2BIT;
+  const uint4* TB_StartGPTR_A_4BIT =
+      Weight_4bit +
+      (y * TilingConfig::BLOCK_ROW_WARPS) * NumBlock_K * NUM_INT4_PER_WARP_4BIT;
+  // StartPTR for each WARP.
+  const uint4* WARP_StartGPTR_A_1BIT =
+      TB_StartGPTR_A_1BIT + WARP_i * NumBlock_K * NUM_INT4_PER_WARP_1BIT;
+  const uint4* WARP_StartGPTR_A_2BIT =
+      TB_StartGPTR_A_2BIT + WARP_i * NumBlock_K * NUM_INT4_PER_WARP_2BIT;
+  const uint4* WARP_StartGPTR_A_4BIT =
+      TB_StartGPTR_A_4BIT + WARP_i * NumBlock_K * NUM_INT4_PER_WARP_4BIT;
+  // StartPTR for each WARP, considering SplitK
+  const size_t WARP_Start_UnitID_K = StartBlockID_K;
+  WARP_StartGPTR_A_1BIT += WARP_Start_UnitID_K * NUM_INT4_PER_WARP_1BIT;
+  WARP_StartGPTR_A_2BIT += WARP_Start_UnitID_K * NUM_INT4_PER_WARP_2BIT;
+  WARP_StartGPTR_A_4BIT += WARP_Start_UnitID_K * NUM_INT4_PER_WARP_4BIT;
+  // Copying A tile from Global to Shared, using double-buffer
+  // //////////////////////////////////////////////////////////
+  // StartSPTR for each ThreadBlock
+  uint32_t* AFrag_1BIT_SPTR = reinterpret_cast<uint32_t*>(smem);
+  uint32_t* AFrag_2BIT_SPTR = AFrag_1BIT_SPTR + SMEM_SIZE_PER_TB_1BIT / 4;
+  uint32_t* AFrag_4BIT_SPTR =
+      AFrag_2BIT_SPTR +
+      SMEM_SIZE_PER_TB_2BIT /
+          4;  // 8 buffers including double buffers, 12 for triple buffers
+  // StartSPTR for each WARP
+  AFrag_1BIT_SPTR += warpId * SMEM_SIZE_PER_WARP_1BIT / 4;
+  AFrag_2BIT_SPTR += warpId * SMEM_SIZE_PER_WARP_2BIT / 4;
+  AFrag_4BIT_SPTR += warpId * SMEM_SIZE_PER_WARP_4BIT / 4;
+  // Pre-fetch of A tile
+  for (int i = 0; i < PIPELINE_LEVEL_GMEM - 1; i++) {
+    if (USE_SEG_1BIT)
+      CopyFromGlobalToShared_A<SMEM_SIZE_PER_WARP_1BIT>(
+          AFrag_1BIT_SPTR + i * SMEM_SIZE_PER_WARP_1BIT / 4 * 4,
+          WARP_StartGPTR_A_1BIT);
+    if (USE_SEG_2BIT)
+      CopyFromGlobalToShared_A<SMEM_SIZE_PER_WARP_2BIT>(
+          AFrag_2BIT_SPTR + i * SMEM_SIZE_PER_WARP_2BIT / 4 * 4,
+          WARP_StartGPTR_A_2BIT);
+    if (USE_SEG_4BIT)
+      CopyFromGlobalToShared_A<SMEM_SIZE_PER_WARP_4BIT>(
+          AFrag_4BIT_SPTR + i * SMEM_SIZE_PER_WARP_4BIT / 4 * 4,
+          WARP_StartGPTR_A_4BIT);
+    WARP_StartGPTR_A_1BIT += SMEM_SIZE_PER_WARP_1BIT / 16;
+    WARP_StartGPTR_A_2BIT += SMEM_SIZE_PER_WARP_2BIT / 16;
+    WARP_StartGPTR_A_4BIT += SMEM_SIZE_PER_WARP_4BIT / 16;
+  }
+  // Global Memory Address for Matrix A (QuantScale)
+  // /////////////////////////////////////////////////////////////////////
+  const half* TB_StartGPTR_A_Scale =
+      Scales + (y * TilingConfig::BLOCK_ROW_WARPS) * 64;
+  const half* WARP_StartGPTR_A_Scales = TB_StartGPTR_A_Scale + WARP_i * 64;
+  CopyFromGlobalToShared_Scales(QuantScales + WARP_i * 64,
+                                WARP_StartGPTR_A_Scales);
+  // Copying B tile from Global to Shared, considering SplitK
+  // /////////////////////////////////////////////////////////////
+  const half* BTile_GPTR =
+      B + Tile_Start_N * K_Global + StartBlockID_K * TilingConfig::TILE_K;
+  for (int i = 0; i < PIPELINE_LEVEL_GMEM - 1; i++) {
+    CopyFromGlobalToShared<TilingConfig::TILE_N, TilingConfig::BLOCK_WARPS>(
+        smem_array + i * TilingConfig::TILE_N, BTile_GPTR, K_Global,
+        NumColumnToCopy);
+    BTile_GPTR += TilingConfig::TILE_K;
+  }
+  // Register Allocation for A, B, and C, Initialized to Zeros
+  // /////////////////////////////////////////////////////////////////////
+  constexpr int NumRegSets_a =
+      WARP_ROW_MMA_TENSORS;  // 1 set = 4 registers, containing a 16*16 MMA
+                             // block
+  constexpr int NumRegSets_b =
+      (TilingConfig::WARP_COL_MMA_TENSORS == 1)
+          ? 1
+          : TilingConfig::WARP_COL_MMA_TENSORS /
+                2;  // 1 set = 4 registers, containing a 16*16 MMA block
+  uint32_t a[NumRegSets_a * PIPELINE_LEVEL_SMEM]
+            [4];  // double/Triple buffer is used // Registers to store
+                  // decompressed FP6
+  uint32_t b[NumRegSets_b * PIPELINE_LEVEL_SMEM]
+            [4];  // double/Triple buffer is used // Register to store FP16 B
+                  // matrix (a slice)
+  float c[NumRegSets_a * NumRegSets_b][REG_PER_THREAD_C_TENSOR_16_16];
+  for (int i = 0; i < NumRegSets_a * NumRegSets_b; i++)
+    for (int j = 0; j < REG_PER_THREAD_C_TENSOR_16_16; j++) c[i][j] = 0.0f;
+  //
+  cp_async_wait_all();
+  __syncthreads();
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  uint32_t Scales_RPTR[4];  // 4 Registers per thread for Quantization Scales
+  ExtractFromSharedToReg_Scales(Scales_RPTR, QuantScales + WARP_i * 64);
+  // Initializing the Software Pipeline: writing registers.
+  // ////////////////////////////////////////////////////////////////////////////////////////////////
+  initialize_mma_slice<TilingConfig, EXPONENT, MANTISSA>(
+      a, b, AFrag_1BIT_SPTR, AFrag_2BIT_SPTR, AFrag_4BIT_SPTR, smem_array,
+      Scales_RPTR);
+// The outer loop.
+// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma unroll(1)
+  for (size_t tile_id_k = 0; tile_id_k < NumIter; tile_id_k++) {
+    // Triple-Buffer for A Tile
+    uint32_t* __restrict__ read_SPTR_Frag_1bit =
+        AFrag_1BIT_SPTR +
+        ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_PER_WARP_1BIT / 4 *
+            4;  // 512  (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+    uint32_t* __restrict__ read_SPTR_Frag_2bit =
+        AFrag_2BIT_SPTR +
+        ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_PER_WARP_2BIT / 4 *
+            4;  // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+    uint32_t* __restrict__ read_SPTR_Frag_4bit =
+        AFrag_4BIT_SPTR +
+        ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_PER_WARP_4BIT / 4 *
+            4;  // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+    uint32_t* __restrict__ read2_SPTR_Frag_1bit =
+        AFrag_1BIT_SPTR + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) *
+                              SMEM_SIZE_PER_WARP_1BIT / 4 * 4;
+    uint32_t* __restrict__ read2_SPTR_Frag_2bit =
+        AFrag_2BIT_SPTR + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) *
+                              SMEM_SIZE_PER_WARP_2BIT / 4 * 4;
+    uint32_t* __restrict__ read2_SPTR_Frag_4bit =
+        AFrag_4BIT_SPTR + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) *
+                              SMEM_SIZE_PER_WARP_4BIT / 4 * 4;
+    uint32_t* __restrict__ write_SPTR_Frag_1bit =
+        AFrag_1BIT_SPTR +
+        ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) *
+            SMEM_SIZE_PER_WARP_1BIT / 4 *
+            4;  // 512  (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+    uint32_t* __restrict__ write_SPTR_Frag_2bit =
+        AFrag_2BIT_SPTR +
+        ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) *
+            SMEM_SIZE_PER_WARP_2BIT / 4 *
+            4;  // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+    uint32_t* __restrict__ write_SPTR_Frag_4bit =
+        AFrag_4BIT_SPTR +
+        ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) *
+            SMEM_SIZE_PER_WARP_4BIT / 4 *
+            4;  // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+    // Triple-Buffer for B Tile
+    // MODIFICATION NOTE: to support MSVC, half __restrict__ (*read_SPTR ) is
+    // changed to below. similarly for read2_SPTR and write_SPTR.
+    half(*__restrict__ read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] =
+        smem_array +
+        ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
+    half(*__restrict__ read2_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] =
+        smem_array +
+        ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
+    half(*__restrict__ write_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] =
+        smem_array +
+        ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) *
+            TilingConfig::TILE_N;
+    //
+    bool GlobalCopy = (tile_id_k + PIPELINE_LEVEL_GMEM - 1) < NumIter;
+    // Copying A tile from Global to Shared, bypassing L1, using double-buffer
+    if (USE_SEG_1BIT)
+      CopyFromGlobalToShared_A<SMEM_SIZE_PER_WARP_1BIT>(
+          write_SPTR_Frag_1bit, WARP_StartGPTR_A_1BIT, GlobalCopy);
+    if (USE_SEG_2BIT)
+      CopyFromGlobalToShared_A<SMEM_SIZE_PER_WARP_2BIT>(
+          write_SPTR_Frag_2bit, WARP_StartGPTR_A_2BIT, GlobalCopy);
+    if (USE_SEG_4BIT)
+      CopyFromGlobalToShared_A<SMEM_SIZE_PER_WARP_4BIT>(
+          write_SPTR_Frag_4bit, WARP_StartGPTR_A_4BIT, GlobalCopy);
+    // copying B tile from GlobalMemory to SharedMemory
+    CopyFromGlobalToShared<TilingConfig::TILE_N, TilingConfig::BLOCK_WARPS>(
+        write_SPTR, BTile_GPTR, K_Global, NumColumnToCopy, GlobalCopy);
+    cp_async_group_commit();
+    core_mma_slice<TilingConfig, EXPONENT, MANTISSA>(
+        c, a, b, read_SPTR_Frag_1bit, read_SPTR_Frag_2bit, read_SPTR_Frag_4bit,
+        read_SPTR, Scales_RPTR,
+        1);  // read_SPTR_Frag_2bit, read_SPTR_Frag_4bit are different for each
+             // WARP; read_SPTR is shared among WARPs
+    core_mma_slice<TilingConfig, EXPONENT, MANTISSA>(
+        c, a, b, read_SPTR_Frag_1bit, read_SPTR_Frag_2bit, read_SPTR_Frag_4bit,
+        read_SPTR, Scales_RPTR, 2);
+    core_mma_slice<TilingConfig, EXPONENT, MANTISSA>(
+        c, a, b, read_SPTR_Frag_1bit, read_SPTR_Frag_2bit, read_SPTR_Frag_4bit,
+        read_SPTR, Scales_RPTR, 3);
+    // Barriers and Synchronizations
+    cp_async_wait_group<PIPELINE_LEVEL_GMEM - 2>();
+    __syncthreads();
+    core_mma_slice<TilingConfig, EXPONENT, MANTISSA>(
+        c, a, b, read2_SPTR_Frag_1bit, read2_SPTR_Frag_2bit,
+        read2_SPTR_Frag_4bit, read2_SPTR, Scales_RPTR, 0);
+    // Updating global PTRs
+    WARP_StartGPTR_A_1BIT +=
+        SMEM_SIZE_PER_WARP_1BIT / 16;  // 2KB/16=128 (1)/16: int4*+1 = char*+16
+    WARP_StartGPTR_A_2BIT +=
+        SMEM_SIZE_PER_WARP_2BIT / 16;  // 4KB/16=256 (1)/16: int4*+1 = char*+16
+    WARP_StartGPTR_A_4BIT +=
+        SMEM_SIZE_PER_WARP_4BIT / 16;  // 8KB/16=512 (1)/16: int4*+1 = char*+16
+    BTile_GPTR += TilingConfig::TILE_K;
+  }
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Store the C fragments to shared memory.
+  float(*smem_CFrag)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4] =
+      reinterpret_cast<
+          float(*)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4]>(smem);
+  StoreToSharedMemoryFromRegister<TilingConfig>(smem_CFrag, c);
+  __syncthreads();
+  // Now that shared memory contains all the D tiles, stream them to global
+  // memory.
+  OutputDataType* BlockGlobalPTR = C + BatchID * (M_Global * N_Global) +
+                                   Tile_Start_M + Tile_Start_N * M_Global;
+  for (size_t i = warpId; i < NumColumnToCopy;
+       i += TilingConfig::BLOCK_WARPS)  // i-th column
+#pragma unroll
+    for (size_t j = threadIdx.x % WARP_SIZE; j < TilingConfig::TILE_M;
+         j += WARP_SIZE)  // j-th row
+    {
+      if constexpr (std::is_same<OutputDataType, half>::value)
+        BlockGlobalPTR[j + i * M_Global] = __float2half_rn(smem_CFrag[i][j]);
+      else
+        BlockGlobalPTR[j + i * M_Global] = smem_CFrag[i][j];
+    }
+}
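
A note on the 1+2+4 weight split used by QUANT_GEMM_Kernel: a BIT_WIDTH-bit weight is stored as up to three bit-plane segments of 1, 2 and 4 bits, and a segment is active exactly when the corresponding bit is set in BIT_WIDTH (USE_SEG_xBIT = BIT_WIDTH & x). A small Python sketch of that bookkeeping (illustrative only, not kernel code):

def active_segments(exponent: int, mantissa: int):
    # Mirrors USE_SEG_1BIT/2BIT/4BIT = BIT_WIDTH & 1/2/4 in the kernel above.
    bit_width = 1 + exponent + mantissa
    segments = [w for w in (1, 2, 4) if bit_width & w]
    assert sum(segments) == bit_width  # the segments reassemble the full value
    return segments

print(active_segments(3, 2))  # FP6 (e3m2) -> [2, 4]
print(active_segments(2, 2))  # FP5 (e2m2) -> [1, 4]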

+ 70 - 0
kernels/quantization/fp6/kernel_reduction.cuh

@@ -0,0 +1,70 @@
+//    Copyright 2024 FP6-LLM authors
+//
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// This file is copied from
+// https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/kernel_reduction.cuh
+
+/***************************************************************************
+ * Copyright 2023 The FLash-LLM Authors. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ***************************************************************************/
+// Used for the reduction of result matrix if Split-K is used
+// Reduction_Workspace:     (Split_K, M_Global, N_Global),  column major
+// C:                       (M_Global, N_Global),           column major
+// Each thread deals with 8 output elements, each element is the sum of Split_K
+// elements
+//      Read  Global: Each Warp/ThreadBlock: 32 threads_per_warp * 8
+//                    float_per_thread (256 bit) -> 256 float per warp
+//      Write Global: Each Warp/ThreadBlock: 32 threads_per_warp * 8
+//                    half_per_thread (128 bit) -> 256 half per warp
+// GridSize = (M_Global*N_Global) / 256
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#define REDUCTION_ELEMENT_PER_THREADBLOCK 256
+#define HALF_PER_128BIT 8
+
+__global__ void SplitK_Reduction(half* C, float* Reduction_Workspace,
+                                 size_t M_Global, size_t N_Global,
+                                 int Split_K) {
+  half* WARP_GPTR_C = C + REDUCTION_ELEMENT_PER_THREADBLOCK * blockIdx.x;
+  float* WARP_GPTR_R =
+      Reduction_Workspace + REDUCTION_ELEMENT_PER_THREADBLOCK * blockIdx.x;
+  half* THREAD_GPTR_C = WARP_GPTR_C + threadIdx.x * HALF_PER_128BIT;
+  float* THREAD_GPTR_R = WARP_GPTR_R + threadIdx.x * HALF_PER_128BIT;
+  // Initializing Thread-Local Results
+  float Results[HALF_PER_128BIT];
+#pragma unroll
+  for (int i = 0; i < HALF_PER_128BIT; i++) Results[i] = 0.0f;
+  // Reduction
+  for (int i = 0; i < Split_K; i++) {
+#pragma unroll
+    for (int j = 0; j < HALF_PER_128BIT; j++) Results[j] += THREAD_GPTR_R[j];
+    THREAD_GPTR_R += M_Global * N_Global;
+  }
+// Writing to global memory
+#pragma unroll
+  for (int i = 0; i < HALF_PER_128BIT; i++)
+    THREAD_GPTR_C[i] = __float2half_rn(Results[i]);
+}
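
The Split-K reduction above sums the FP32 partial results produced by each split and writes the total back as FP16. A NumPy sketch of the same computation at the tensor level (a reference for testing, not the kernel itself; the function name is hypothetical):

import numpy as np

def splitk_reduce_reference(reduction_workspace: np.ndarray) -> np.ndarray:
    # reduction_workspace: (Split_K, M_Global, N_Global), float32. The kernel
    # works on column-major storage, but the arithmetic is the same here.
    return reduction_workspace.sum(axis=0).astype(np.float16)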

+ 82 - 0
kernels/quantization/fp6/ptx_cp.async.cuh

@@ -0,0 +1,82 @@
+//    Copyright 2024 FP6-LLM authors
+//
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// This file is copied from
+// https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/ptx_cp.async.cuh
+
+/***************************************************************************
+ * Copyright 2023 The FLash-LLM Authors. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ***************************************************************************/
+// Extended from CUTLASS's source code
+
+#ifndef PTX_CP_ASYNC_CUH
+#define PTX_CP_ASYNC_CUH
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+template <int SizeInBytes>
+__device__ __forceinline__ void cp_async(half* smem_ptr, const half* global_ptr,
+                                         bool pred_guard = true) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  static_assert(SizeInBytes == 16, "Size is not supported");
+  unsigned smem_int_ptr = __cvta_generic_to_shared(smem_ptr);
+  asm volatile(
+      "{ \n"
+      "  .reg .pred p;\n"
+      "  setp.ne.b32 p, %0, 0;\n"
+      "  @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred_guard),
+      "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes));
+#endif
+}
+
+/// Establishes an ordering w.r.t. previously issued cp.async instructions. Does
+/// not block.
+__device__ __forceinline__ void cp_async_group_commit() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.commit_group;\n" ::);
+#endif
+}
+
+/// Blocks until all but <N> previous cp.async.commit_group operations have
+/// committed.
+template <int N>
+__device__ __forceinline__ void cp_async_wait_group() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
+#endif
+}
+
+/// Blocks until all previous cp.async.commit_group operations have committed.
+// cp.async.wait_all is equivalent to:
+// cp.async.commit_group;
+// cp.async.wait_group 0;
+__device__ __forceinline__ void cp_async_wait_all() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.wait_all;\n" ::);
+#endif
+}
+
+#endif

+ 108 - 0
kernels/quantization/fp6/ptx_mma.cuh

@@ -0,0 +1,108 @@
+//    Copyright 2024 FP6-LLM authors
+//
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// This file is modified from
+// https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/include/ptx_mma.cuh
+
+/***************************************************************************
+ * Copyright 2023 The FLash-LLM Authors. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ***************************************************************************/
+#ifndef PTX_MMA_CUH
+#define PTX_MMA_CUH
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <assert.h>
+#include "configs.h"
+
+// MODIFICATION NOTE: to support MSVC
+// - uint32_t __restrict__ Reg[][4] is changed to uint32_t (* __restrict__
+// Reg)[4]
+// - half __restrict__ (*read_SPTR) is changed to half (* __restrict__
+// read_SPTR)
+template <typename TilingConfig>
+__device__ __forceinline__ void B_FromSharedToReg(
+    uint32_t (*__restrict__ Reg)[4],
+    half (*__restrict__ read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+    int slice_id) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  #ifdef DEBUG_MODE
+  static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) ||
+                (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0));
+  #endif
+
+  const int warpId = threadIdx.x / WARP_SIZE;
+  int lane_id = threadIdx.x % WARP_SIZE;
+  int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS;
+  int warp_start_col =
+      TilingConfig::WARP_COL_MMA_TENSORS * MMA_8 *
+      WARP_j;  // each warp may start from reading warp_start_col'th column of
+               // the B tile in shared memory
+  #ifdef DEBUG_MODE
+  assert(warp_start_col == 0);
+  #endif
+
+  int col = (lane_id % 8) + (lane_id / 16) * 8;
+  int row = (lane_id % 16) / 8 * 8;
+  uint32_t smem_local_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(
+      &read_SPTR[warp_start_col + col][slice_id * MMA_16 + row]));
+  if (TilingConfig::WARP_COL_MMA_TENSORS == 1) {
+    asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n"
+                 : "=r"(Reg[0][0]), "=r"(Reg[0][1])
+                 : "r"(smem_local_ptr));
+  } else {
+  #pragma unroll
+    for (int i = 0; i < TilingConfig::WARP_COL_MMA_TENSORS / 2; i++) {
+      asm volatile(
+          "ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
+          : "=r"(Reg[i][0]), "=r"(Reg[i][1]), "=r"(Reg[i][2]), "=r"(Reg[i][3])
+          : "r"(smem_local_ptr));
+      smem_local_ptr +=
+          16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half);
+    }
+  }
+#endif
+}
+
+// MODIFICATION NOTE: to support MSVC, the function signature is changed from
+// MMA_FP16_M16N8K16(uint32_t __restrict__ c[], uint32_t __restrict__ *a,
+// uint32_t __restrict__ *b).
+__device__ __forceinline__ void MMA_FP16_M16N8K16(uint32_t* __restrict__ c,
+                                                  uint32_t* __restrict__ a,
+                                                  uint32_t* __restrict__ b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+      "{ %0, %1, %2, %3},"
+      "{ %4, %5, %6, %7 },"
+      "{ %8, %9 },"
+      "{ %10, %11, %12, %13 };"
+      : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+      : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+        "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
+#endif
+}
+
+#endif  // PTX_MMA_CUH
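
MMA_FP16_M16N8K16 wraps a single mma.sync.aligned.m16n8k16 tensor core instruction. Ignoring the per-thread register layout that B_FromSharedToReg and the A fragments take care of, its tile-level arithmetic is D = A @ B + C with A 16x16 FP16, B 16x8 FP16 and C/D 16x8 FP32. A NumPy sketch of just that arithmetic (illustrative only; the function name is hypothetical):

import numpy as np

def mma_m16n8k16_reference(a_f16: np.ndarray, b_f16: np.ndarray,
                           c_f32: np.ndarray) -> np.ndarray:
    # a_f16: (16, 16) float16, b_f16: (16, 8) float16, c_f32: (16, 8) float32.
    assert a_f16.shape == (16, 16) and b_f16.shape == (16, 8)
    return a_f16.astype(np.float32) @ b_f16.astype(np.float32) + c_f32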

+ 188 - 0
kernels/quantization/fp6/utils_core.cuh

@@ -0,0 +1,188 @@
+//    Copyright 2024 FP6-LLM authors
+//
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// This file is modified from
+// https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/include/utils_core.cuh
+
+#ifndef UTILS_CORE_CUH
+#define UTILS_CORE_CUH
+
+#include <assert.h>
+
+#include "configs.h"
+#include "ptx_mma.cuh"
+#include "utils_parallel_dequant.cuh"
+
+template <int NUM_INT_PER_THREAD>
+__device__ __forceinline__ void CopyFromSharedToRegister_AFrag(uint32_t Reg[],
+                                                               uint32_t* SPTR,
+                                                               int slice_id) {
+  SPTR += slice_id * (NUM_INT_PER_THREAD * WARP_SIZE);
+  int lane_id = threadIdx.x % WARP_SIZE;
+#pragma unroll
+  for (int i = 0; i < NUM_INT_PER_THREAD; i++) {
+    Reg[i] = SPTR[lane_id + i * WARP_SIZE];
+  }
+}
+
+// MODIFICATION NOTE: to support MSVC, half __restrict__
+// (*B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] is changed to below.
+template <typename TilingConfig, int EXPONENT, int MANTISSA>
+__device__ __forceinline__ void initialize_mma_slice(
+    uint32_t (*a)[4], uint32_t (*b)[4], uint32_t* __restrict__ A_1BIT_SPTR_read,
+    uint32_t* __restrict__ A_2BIT_SPTR_read,
+    uint32_t* __restrict__ A_4BIT_SPTR_read,
+    half (*__restrict__ B_SPTR_read)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+    uint32_t* RPTR_Scales) {
+  // 1+2+4 weight split
+  constexpr int BIT_WIDTH = 1 + EXPONENT + MANTISSA;
+  constexpr int USE_SEG_1BIT = BIT_WIDTH & 1;
+  constexpr int USE_SEG_2BIT = BIT_WIDTH & 2;
+  constexpr int USE_SEG_4BIT = BIT_WIDTH & 4;
+  // Writing registers
+  // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6
+  // per thread => 6 registers per thread;
+  uint32_t a_1bit[1];  // NO double buffer
+  uint32_t a_2bit[2];  // NO double buffer
+  uint32_t a_4bit[4];  // NO double buffer
+  if (USE_SEG_1BIT)
+    CopyFromSharedToRegister_AFrag<1>(a_1bit, A_1BIT_SPTR_read, 0);
+  if (USE_SEG_2BIT)
+    CopyFromSharedToRegister_AFrag<2>(a_2bit, A_2BIT_SPTR_read, 0);
+  if (USE_SEG_4BIT)
+    CopyFromSharedToRegister_AFrag<4>(a_4bit, A_4BIT_SPTR_read, 0);
+  Dequant_32FP6_4Way<EXPONENT, MANTISSA>(
+      a, a_1bit, a_2bit, a_4bit,
+      RPTR_Scales);  // SIMT Dequant: dequantizing FPx to FP16 at register
+                     // level, dequantizing a slice each time
+  B_FromSharedToReg<TilingConfig>(b, B_SPTR_read,
+                                  0);  // Loading B from shared to registers
+}
+
+// MODIFICATION NOTE: to support MSVC, half __restrict__
+// (*B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] is changed to below.
+template <typename TilingConfig, int EXPONENT, int MANTISSA>
+__device__ __forceinline__ void core_mma_slice(
+    float c[][REG_PER_THREAD_C_TENSOR_16_16], uint32_t (*a)[4],
+    uint32_t (*b)[4], uint32_t* __restrict__ A_1bit_SPTR_read,
+    uint32_t* __restrict__ A_2bit_SPTR_read,
+    uint32_t* __restrict__ A_4bit_SPTR_read,
+    half (*__restrict__ B_SPTR_read)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+    uint32_t* RPTR_Scales,
+    int slice_id)  // writing slice[slice_id] to registers, k=0 -> slice_id=1
+                   // for prefetching
+{
+  // 1+2+4 weight split
+  constexpr int BIT_WIDTH = 1 + EXPONENT + MANTISSA;
+  constexpr int USE_SEG_1BIT = BIT_WIDTH & 1;
+  constexpr int USE_SEG_2BIT = BIT_WIDTH & 2;
+  constexpr int USE_SEG_4BIT = BIT_WIDTH & 4;
+
+#ifdef DEBUG_MODE
+  assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) ||
+         (TilingConfig::WARP_COL_MMA_TENSORS % 2 ==
+          0));  // if WARP_COL_MMA_TENSORS == 1, B tile in registers is padded
+                // to a 16*16 MMA block
+#endif
+  const int NumRegSets_a =
+      WARP_ROW_MMA_TENSORS;  // 1 set = 4 registers, containing a 16*16 MMA
+                             // block
+  const int NumRegSets_b =
+      (TilingConfig::WARP_COL_MMA_TENSORS == 1)
+          ? 1
+          : TilingConfig::WARP_COL_MMA_TENSORS /
+                2;  // 1 set = 4 registers, containing a 16*16 MMA block
+  uint32_t(*c_uint_ptr)[REG_PER_THREAD_C_TENSOR_16_16] =
+      reinterpret_cast<uint32_t(*)[REG_PER_THREAD_C_TENSOR_16_16]>(
+          c);  // GlobalRegisters for accumulated FP32 results
+
+  // Setting RPTRs for double buffers
+  uint32_t(*a_read)[4] = a;
+  uint32_t(*a_write)[4] = a;
+  uint32_t(*b_read)[4] = b;
+  uint32_t(*b_write)[4] = b;
+  if (slice_id % 2 == 1) {
+    b_write += NumRegSets_b;
+    a_write += NumRegSets_a;
+  } else {
+    b_read += NumRegSets_b;
+    a_read += NumRegSets_a;
+  }
+
+// Reading registers and issuing core tensor core computations (a slice of A and
+// B tile in shared memory)
+#pragma unroll
+  for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) {
+    if (TilingConfig::WARP_COL_MMA_TENSORS == 1) {
+      MMA_FP16_M16N8K16(c_uint_ptr[i], a_read[i], b_read[0]);
+    } else {
+#pragma unroll
+      for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS / 2; j++) {
+        MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS], a_read[i],
+                          b_read[j]);
+        MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS] + 4,
+                          a_read[i], b_read[j] + 2);  // c+4; b+2
+      }
+    }
+  }
+  // Writing registers
+  // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6
+  // per thread => 6 registers per thread;
+  uint32_t a_1bit[1];  // NO double buffer
+  uint32_t a_2bit[2];  // NO double buffer
+  uint32_t a_4bit[4];  // NO double buffer
+  if (USE_SEG_1BIT)
+    CopyFromSharedToRegister_AFrag<1>(a_1bit, A_1bit_SPTR_read, slice_id);
+  if (USE_SEG_2BIT)
+    CopyFromSharedToRegister_AFrag<2>(a_2bit, A_2bit_SPTR_read, slice_id);
+  if (USE_SEG_4BIT)
+    CopyFromSharedToRegister_AFrag<4>(a_4bit, A_4bit_SPTR_read, slice_id);
+  Dequant_32FP6_4Way<EXPONENT, MANTISSA>(
+      a_write, a_1bit, a_2bit, a_4bit,
+      RPTR_Scales);  // SIMT Dequant: dequantizing FP6 to FP16 at register
+                     // level, dequantizing a slice each time
+  B_FromSharedToReg<TilingConfig>(
+      b_write, B_SPTR_read, slice_id);  // Loading B from shared to registers
+}
+
+template <typename TilingConfig>
+__device__ __forceinline__ void StoreToSharedMemoryFromRegister(
+    float (*smem_CFrag)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4],
+    float c[][REG_PER_THREAD_C_TENSOR_16_16]) {
+  const int lane_id = threadIdx.x % WARP_SIZE;
+  const int warpId = threadIdx.x / WARP_SIZE;
+  int warp_row_offset = warpId * (MMA_16 * WARP_ROW_MMA_TENSORS);
+#pragma unroll
+  for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) {
+#pragma unroll
+    for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS;
+         j++) {  // Dealing with one 16*8 Tensor
+      int RegSetID = i + (j / 2) * WARP_ROW_MMA_TENSORS;
+      int RegOffset = (j % 2) * (REG_PER_THREAD_C_TENSOR_16_16 / 2);
+      int Tensor_row_offset = warp_row_offset + i * MMA_16;
+      int Tensor_col_offset = j * MMA_8;
+#pragma unroll
+      for (int r = 0; r < REG_PER_THREAD_C_TENSOR_16_16 / 2; r++) {
+        int row_offset = lane_id / 4;
+        if (r >= 2) row_offset += 8;
+        int col_offset = (lane_id % 4) * 2;
+        if (r % 2 == 1) col_offset += 1;
+        smem_CFrag[Tensor_col_offset + col_offset]
+                  [Tensor_row_offset + row_offset] = c[RegSetID][r + RegOffset];
+      }
+    }
+  }
+}
+
+#endif
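
core_mma_slice double-buffers the A/B register fragments: the slice currently fed to the tensor cores is read from one half of the register sets while the next slice (slice_id) is dequantized into the other half, with the halves chosen by the parity of slice_id. A tiny Python sketch of that selection (illustrative only; buffer_roles is a hypothetical name):

def buffer_roles(slice_id: int):
    # Matches the pointer adjustment in core_mma_slice: odd slice_id writes
    # the second half and reads the first, even slice_id does the opposite.
    if slice_id % 2 == 1:
        return {"read": 0, "write": 1}
    return {"read": 1, "write": 0}

# Per K tile the kernel issues slices 1, 2, 3 and then slice 0 of the next
# tile, so the read/write halves ping-pong on every call.
print([buffer_roles(s) for s in (1, 2, 3, 0)])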

+ 94 - 0
kernels/quantization/fp6/utils_gmem.cuh

@@ -0,0 +1,94 @@
+//    Copyright 2024 FP6-LLM authors
+//
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// This file is modified from
+// https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/utils_gmem.cuh
+
+#ifndef UTILS_GMEM_CUH
+#define UTILS_GMEM_CUH
+
+#include <assert.h>
+#include "configs.h"
+#include "ptx_cp.async.cuh"
+
+/*
+ * Copying A1/A2 from global memory to shared memory.
+ * Usually 1024 or 2048 Bytes
+ */
+template <int SMEM_SIZE_IN_BYTES_PER_WARP>
+__device__ __forceinline__ void CopyFromGlobalToShared_A(
+    uint32_t* SPTR, const uint4* GPTR, bool pred_guard = true) {
+#ifdef DEBUG_MODE
+  static_assert(SMEM_SIZE_IN_BYTES_PER_WARP / WARP_SIZE % 16 == 0);
+#endif
+  int lane_id = threadIdx.x % WARP_SIZE;
+  half* SPTR_HALF = reinterpret_cast<half*>(SPTR);
+  const half* GPTR_HALF = reinterpret_cast<const half*>(GPTR);
+  SPTR_HALF += lane_id * 8;
+  GPTR_HALF += lane_id * 8;
+#pragma unroll
+  for (int i = 0; i < SMEM_SIZE_IN_BYTES_PER_WARP / WARP_SIZE / 16; i++) {
+    cp_async<16>(SPTR_HALF, GPTR_HALF, pred_guard);
+    SPTR_HALF += 256;  // Forward 512 Bytes
+    GPTR_HALF += 256;  // Forward 512 Bytes
+  }
+}
+
+/*
+ * Copying 64 Quant Scales (FP16) from global memory to shared memory.
+ */
+__device__ __forceinline__ void CopyFromGlobalToShared_Scales(
+    half* SPTR_QuantScales, const half* GPTR_A_Scales) {
+  int lane_id = threadIdx.x % WARP_SIZE;
+  int Offset_Shared = lane_id * 2;
+  int Offset_Global = lane_id / 4 + (lane_id % 4) * 16;
+  for (int i = 0; i < 2; i++)
+    SPTR_QuantScales[Offset_Shared + i] = GPTR_A_Scales[Offset_Global + i * 8];
+}
+
+// MODIFICATION NOTE: to support MSVC, half __restrict__
+// (*SharedPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] is changed to below.
+/*
+ * (1) Copying X  rows * 64 columns of FP16 values, originally in row    major
+ * (2) Copying 64 rows * X  columns of FP16 values, originally in column major
+ * 16 Bytes per thread -> 512 Bytes per WARP = 4 line per WARP = 1 line per 8
+ * Threads
+ */
+template <int MaxNumOfLinesToCopy, int BLOCK_WARPS>
+__device__ __forceinline__ void CopyFromGlobalToShared(
+    half (*__restrict__ SharedPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+    const half* GlobalPTR, const int GlobalStride,
+    const int NumOfLinesLeft,  // To support arbitrary N dimensions.
+    bool Pred = true) {
+  // static parameters: 1 Group (8 Threads) can copy 1 line (64 FP16) each time
+  const int NumOfThreads = BLOCK_WARPS * WARP_SIZE;
+  const int NumOfGroups = NumOfThreads / 8;
+  const int MaxIteration = (MaxNumOfLinesToCopy - 1) / NumOfGroups + 1;
+  // runtime variables
+  const int line_id = threadIdx.x / 8;
+  const int line_offset = (threadIdx.x % 8) * 8;
+  // PTR for source global memory and target shared memory
+  GlobalPTR += line_id * GlobalStride + line_offset;
+  SharedPTR += line_id;
+#pragma unroll
+  for (int i = 0; i < MaxIteration; i++) {
+    bool AsyncCopyPred = (line_id + i * NumOfGroups) < NumOfLinesLeft && Pred;
+    cp_async<16>(&(*SharedPTR)[line_offset], GlobalPTR, AsyncCopyPred);
+    //
+    GlobalPTR += NumOfGroups * GlobalStride;
+    SharedPTR += NumOfGroups;
+  }
+}
+
+#endif
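
The index arithmetic in CopyFromGlobalToShared_Scales distributes 64 FP16 scales across a warp: each lane writes two consecutive shared slots and reads two global elements 8 apart, and the 32 lanes together touch every global index exactly once. A short Python check of that mapping (illustrative only; the function name is hypothetical):

def scale_copy_mapping():
    # shared index = lane*2 + i, global index = lane//4 + (lane%4)*16 + i*8
    mapping = {}
    for lane in range(32):
        for i in range(2):
            mapping[lane * 2 + i] = lane // 4 + (lane % 4) * 16 + i * 8
    assert sorted(mapping.values()) == list(range(64))  # a permutation of 0..63
    return mapping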

+ 148 - 0
kernels/quantization/fp6/utils_parallel_dequant.cuh

@@ -0,0 +1,148 @@
+//    Copyright 2024 FP6-LLM authors
+//
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// This file is modified from
+// https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/include/utils_parallel_dequant.cuh
+// To support MSVC, all instances of u_int32_t are changed to uint32_t.
+
+#ifndef UTILS_PARALLELDEQUANT_CUH
+#define UTILS_PARALLELDEQUANT_CUH
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+/*
+ * Input:   R1
+ * Outputs: R1, R2
+ * Note:    Simplified Exponent calculation is applied.
+ */
+template <int EXPONENT, int MANTISSA>
+__device__ __forceinline__ void FPx_FP16_Cast_4Way(uint32_t* In, uint32_t* Out1,
+                                                   uint32_t* Out2) {
+  //
+  constexpr int RIGHT_SHIFT = 5 - EXPONENT;
+  constexpr int MASK1 = 0x80000000;
+  constexpr int MASK2 = MASK1 >> EXPONENT + MANTISSA;
+  constexpr int MASK3 = MASK2 & 0x7fffffff;
+  constexpr int MASK = MASK3 | MASK3 >> 16;
+  //
+  *Out1 = *In & 0x80008000;
+  *Out1 |= ((*In) & MASK) >> RIGHT_SHIFT;
+  //
+  *In = (*In) << 8;
+  *Out2 = *In & 0x80008000;
+  *Out2 |= ((*In) & MASK) >> RIGHT_SHIFT;
+}
+
+template <int EXPONENT, int MANTISSA>
+__device__ __forceinline__ uint32_t MultScale(uint32_t PackedFP16Pair,
+                                              half Scale) {
+  constexpr int BIAS_OFFSET = (int(1) << (5 - 1)) - (int(1) << (EXPONENT - 1));
+  constexpr int BIAS = int(1) << BIAS_OFFSET;
+  //
+  half* FP16_1 = reinterpret_cast<half*>(&PackedFP16Pair);
+  half* FP16_2 = FP16_1 + 1;
+  uint32_t output;
+  half* output_half_ptr = reinterpret_cast<half*>(&output);
+  output_half_ptr[0] =
+      __hmul(__hmul(*FP16_1, __float2half(1.0f * BIAS)), Scale);
+  output_half_ptr[1] =
+      __hmul(__hmul(*FP16_2, __float2half(1.0f * BIAS)), Scale);
+  return output;
+}
+
+// MODIFICATION NOTE: to support MSVC
+// - u_int32_t __restrict__ Reg[][4] is changed to below.
+// - u_int32_t __restrict__ *read_RPTR_1bit is changed to below. similarly for
+// read_RPTR_2bit and read_RPTR_4bit
+template <int EXPONENT, int MANTISSA>
+__device__ __forceinline__ void Dequant_32FP6_4Way(
+    uint32_t (*__restrict__ Reg)[4], uint32_t* __restrict__ read_RPTR_1bit,
+    uint32_t* __restrict__ read_RPTR_2bit,
+    uint32_t* __restrict__ read_RPTR_4bit, uint32_t* Scales) {
+  // 1+2+4 weight split
+  constexpr int BIT_WIDTH = 1 + EXPONENT + MANTISSA;
+  constexpr int USE_SEG_1BIT = BIT_WIDTH & 1;
+  constexpr int USE_SEG_2BIT = BIT_WIDTH & 2;
+  constexpr int USE_SEG_4BIT = BIT_WIDTH & 4;
+  //
+  uint32_t* OutputRegs = reinterpret_cast<uint32_t*>(Reg);
+  uint32_t* Frag_PTR_1bit = read_RPTR_1bit;
+  uint32_t* Frag_PTR_2bit = read_RPTR_2bit;
+  uint32_t* Frag_PTR_4bit = read_RPTR_4bit;
+  half* Scale_RPTR = reinterpret_cast<half*>(Scales);
+// Dequantizing 32 FP6, each loop iteration dequantizing 4 FP6
+#pragma unroll(8)
+  for (int i = 0; i < 8; i++) {
+    uint32_t Packed_FP6 = 0;
+    uint32_t tmp = 0;
+    // 1bit Frag
+    if (USE_SEG_1BIT) {
+      tmp = (*Frag_PTR_1bit) & 0x80808080;
+      Packed_FP6 |= tmp >> (BIT_WIDTH & 0);
+      if (i % 8 == 7)
+        Frag_PTR_1bit++;
+      else
+        (*Frag_PTR_1bit) = (*Frag_PTR_1bit) << 1;
+    }
+    // 2bit Frag
+    if (USE_SEG_2BIT) {
+      tmp = (*Frag_PTR_2bit) & 0xc0c0c0c0;
+      Packed_FP6 |= tmp >> (BIT_WIDTH & 1);
+      if (i % 4 == 3)
+        Frag_PTR_2bit++;
+      else
+        (*Frag_PTR_2bit) = (*Frag_PTR_2bit) << 2;
+    }
+    // 4bit Frag
+    if (USE_SEG_4BIT) {
+      tmp = (*Frag_PTR_4bit) & 0xf0f0f0f0;
+      Packed_FP6 |= tmp >> (BIT_WIDTH & 3);
+      if (i % 2 == 1)
+        Frag_PTR_4bit++;
+      else
+        (*Frag_PTR_4bit) = (*Frag_PTR_4bit) << 4;
+    }
+    //
+    uint32_t out1, out2;
+    FPx_FP16_Cast_4Way<EXPONENT, MANTISSA>(&Packed_FP6, &out1, &out2);
+    //
+    *OutputRegs = MultScale<EXPONENT, MANTISSA>(
+        out1, Scale_RPTR[0]);  // Multiply FP16 scales
+    OutputRegs += 1;
+    *OutputRegs = MultScale<EXPONENT, MANTISSA>(
+        out2, Scale_RPTR[1]);  // Multiply FP16 scales
+    OutputRegs += 1;
+    // Updating offset for FP16 scales for every two iterations
+    if (i % 2 == 1) Scale_RPTR += 2;
+  }
+}
+
+/*
+ * Extracting quantization scales from shared memory to registers: each lane
+ * loads one uint32 (two FP16 scales), then gathers the four uint32 values of
+ * its 4-lane group via warp shuffle, ending up with 8 scales per thread.
+ */
+__device__ __forceinline__ void ExtractFromSharedToReg_Scales(
+    uint32_t* Scales, half* WARP_SPTR_Scales) {
+  int lane_id = threadIdx.x % WARP_SIZE;
+  uint32_t* SPTR_uint = reinterpret_cast<uint32_t*>(WARP_SPTR_Scales);
+  uint32_t tmpReg = SPTR_uint[lane_id];
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    // T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize);
+    Scales[i] = __shfl_sync(0xffffffff, tmpReg, i, 4);
+  }
+}
+
+#endif
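
In MultScale, BIAS = 2^(2^(5-1) - 2^(EXPONENT-1)) is the exponent-bias correction applied after FPx_FP16_Cast_4Way drops the FPx exponent bits into the FP16 exponent field: it equals 2^(fp16_bias - fpx_bias) with fp16_bias = 15 and fpx_bias = 2^(EXPONENT-1) - 1. A short Python check (illustrative only; bias_factor is a hypothetical name):

def bias_factor(exponent: int) -> int:
    # Same expression as the constexpr BIAS in MultScale.
    bias_offset = (1 << (5 - 1)) - (1 << (exponent - 1))
    return 1 << bias_offset

for e in range(1, 6):  # exponent widths used by the FP2..FP7 formats above
    fp16_bias, fpx_bias = 15, (1 << (e - 1)) - 1
    assert bias_factor(e) == 2 ** (fp16_bias - fpx_bias)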

+ 6 - 0
kernels/quantization/quant_ops.h

@@ -132,6 +132,12 @@ torch::Tensor marlin_qqq_gemm(torch::Tensor const& a,
                               torch::Tensor& workspace, int64_t size_m,
                               int64_t size_n, int64_t size_k);
 
+torch::Tensor fp_eXmY_linear_forward_cuda(int64_t EXPONENT, int64_t MANTISSA,
+                                          torch::Tensor _in_feats,
+                                          torch::Tensor _weights,
+                                          torch::Tensor _scales,
+                                          int64_t splitK = 1);
+
 #endif
 
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,

+ 9 - 0
kernels/torch_bindings.cpp

@@ -196,6 +196,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // QuIP# Decompress
   ops.def("quip_decompress", &decompress_e8p_origorder);
   ops.impl("quip_decompress", torch::kCUDA, &decompress_e8p_origorder);
+
+  // fp6_llm
+  ops.def(
+      "fp_eXmY_linear_forward_cuda(int EXPONENT, int MANTISSA,"
+      "                            Tensor _in_feats, Tensor _weights,"
+      "                            Tensor _scales, int splitK=1) -> Tensor");
+  ops.impl("fp_eXmY_linear_forward_cuda", torch::kCUDA,
+           &fp_eXmY_linear_forward_cuda);
+
 #endif
 
   // Quantized GEMM for GPTQ.

+ 9 - 0
tests/benchmarks/engine/throughput.py

@@ -66,6 +66,7 @@ def run_aphrodite(
     model: str,
     tokenizer: str,
     quantization: Optional[str],
+    quant_llm_fp_bits: Optional[int],
     tensor_parallel_size: int,
     seed: int,
     n: int,
@@ -90,6 +91,7 @@ def run_aphrodite(
         model=model,
         tokenizer=tokenizer,
         quantization=quantization,
+        quant_llm_fp_bits=quant_llm_fp_bits,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
         trust_remote_code=trust_remote_code,
@@ -226,6 +228,7 @@ def main(args: argparse.Namespace):
     if args.backend == "aphrodite":
         elapsed_time = run_aphrodite(
             requests, args.model, args.tokenizer, args.quantization,
+            args.quant_llm_fp_bits,
             args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
@@ -286,6 +289,12 @@ if __name__ == "__main__":
                         '-q',
                         choices=[*QUANTIZATION_METHODS, None],
                         default=None)
+    parser.add_argument('--quant-llm-fp-bits',
+                        type=int,
+                        default=None,
+                        choices=[4, 5, 6, 7],
+                        help="Number of bits for the FP quantization in "
+                        "QuantLLM")
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n",
     parser.add_argument("--n",
                         type=int,
                         type=int,