// Common device-side utilities for int4 (s4) weight dequantization:
// PTX-based s4 -> fp16/bf16 expansion, shared-memory pointer casts,
// ldmatrix wrappers, inter-block spin-lock flags, and small shape helpers.
#pragma once

#include <cassert>
#include <cstdint>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <type_traits>

namespace aphrodite {
namespace autoquant {

// 1 when compiling device code for Turing (SM75) or newer; 0 otherwise
// (including host passes, where __CUDA_ARCH__ is undefined).
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
#define APHRODITE_ARCH_SM75 1
#else
#define APHRODITE_ARCH_SM75 0
#endif

// 1 when compiling device code for Ampere (SM80) or newer.
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
#define APHRODITE_ARCH_SM80 1
#else
#define APHRODITE_ARCH_SM80 0
#endif

// Threads per warp on all current NVIDIA architectures.
constexpr int WARP_SIZE = 32;

// Portable loop-unrolling pragmas. NVRTC and clang-cuda take the C++11
// _Pragma operator; the nvcc/EDG path below relies on the front end
// accepting `#pragma` tokens inside a macro expansion. Both expand to
// nothing on host-only / IntelliSense passes.
#if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__)
#if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__))
#define PRAGMA_UNROLL _Pragma("unroll")
#define PRAGMA_NO_UNROLL _Pragma("unroll 1")
#else
#define PRAGMA_UNROLL #pragma unroll
#define PRAGMA_NO_UNROLL #pragma unroll 1
#endif
#else
#define PRAGMA_UNROLL
#define PRAGMA_NO_UNROLL
#endif

// Largest finite fp16 value; used when clamping floats before narrowing to half.
static const float HALF_FLT_MAX = 65504.F;
// Dequantizes 8 packed 4-bit values (one 32-bit word) into 8 fp16 values,
// returned as a uint4 holding four half2 pairs.
//
// Technique (FasterTransformer-style): OR each nibble into the mantissa of
// an fp16 whose exponent encodes 1024 (0x6400), so the half equals
// 1024 + n; a final sub/fma pass removes the bias. The top-nibble lanes sit
// 4 bits higher, so they carry 1024 + 16*n and are fixed up with
// fma(x, 1/16, -64).
//
// NOTE(review): the nibble->output scatter emits pairs
// h[0]=(n0,n4), h[1]=(n1,n5), h[2]=(n2,n6), h[3]=(n3,n7), so the packed
// weights are presumably pre-permuted offline to compensate — confirm
// against the packing code.
__inline__ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source)
{
    uint4 result;
    // View the four output half2 registers as raw 32-bit lanes for the PTX below.
    uint32_t* h = reinterpret_cast<uint32_t*>(&result);
    uint32_t const i4s = reinterpret_cast<uint32_t const&>(source);

    // lop3 immediate LUT 0xea computes (a & b) | c.
    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
    // Selects nibbles 0 and 4 (bits 0-3 and 16-19).
    static constexpr uint32_t BOTTOM_MASK = 0x000f000f;
    // Selects nibbles 1 and 5 (bits 4-7 and 20-23).
    static constexpr uint32_t TOP_MASK = 0x00f000f0;
    // fp16 1024.0 in both halves; OR-ing a nibble into the mantissa yields 1024 + n.
    static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400;

    // Shift so nibbles 2/6 and 3/7 line up with the same two masks.
    const uint32_t top_i4s = i4s >> 8;

    // h[0] = biased (n0, n4)
    asm("lop3.b32 %0, %1, %2, %3, %4;\n"
        : "=r"(h[0])
        : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
    // h[1] = biased (16*n1, 16*n5)
    asm("lop3.b32 %0, %1, %2, %3, %4;\n"
        : "=r"(h[1])
        : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
    // h[2] = biased (n2, n6)
    asm("lop3.b32 %0, %1, %2, %3, %4;\n"
        : "=r"(h[2])
        : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
    // h[3] = biased (16*n3, 16*n7)
    asm("lop3.b32 %0, %1, %2, %3, %4;\n"
        : "=r"(h[3])
        : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));

    // Fix-up constants:
    // fp16 1024.0 pair — the additive bias introduced above.
    static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400;
    // fp16 1/16 pair — undoes the 4-bit positional shift of the top nibbles.
    static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00;
    // fp16 -64 pair — equals -1024/16, removing the bias after the 1/16 scale.
    static constexpr uint32_t NEG_64 = 0xd400d400;

    // h[0] = h[0] - 1024          -> (n0, n4)
    asm("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM));
    // h[1] = h[1]/16 - 64         -> (n1, n5)
    asm("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
    // h[2] = h[2] - 1024          -> (n2, n6)
    asm("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM));
    // h[3] = h[3]/16 - 64         -> (n3, n7)
    asm("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
    return result;
}
// Dequantizes 8 packed 4-bit values into 8 fp16 values (uint4 = 4x half2)
// using only one lop3 (plus a shift for the bottom pairs) per output pair
// and NO floating-point fix-up pass: every output half stays biased as
// 64 + n with n in [0, 15] (0x5400 is fp16 64.0; a nibble OR-ed at mantissa
// bits 4..7 adds exactly n). NOTE(review): the +64 bias is presumably folded
// into the scales/zero-points applied downstream — confirm in the consuming
// kernel.
//
// Output pairing matches dequantize_s4_to_fp16x2: h[0]=(n0,n4),
// h[1]=(n1,n5), h[2]=(n2,n6), h[3]=(n3,n7), so packed weights must be
// pre-permuted accordingly.
//
// Change vs. original: the reference also carried a sub.f16x2-based variant
// permanently disabled behind `if (0)` (and its MAGIC_NUM_0 constant); that
// dead branch is removed here. The live path is unchanged.
__inline__ __device__ uint4 dequantize_s4_to_fp16x2_v2(uint32_t const& source)
{
    uint4 result;
    // View the four output half2 registers as raw 32-bit lanes for the PTX below.
    uint32_t* h = reinterpret_cast<uint32_t*>(&result);
    uint32_t const& i4s = reinterpret_cast<uint32_t const&>(source);

    // lop3 immediate LUT 0xea computes (a & b) | c.
    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint32_t BOT_MASK = 0x000f000f;  // nibbles 0 and 4
    static constexpr uint32_t TOP_MASK = 0x00f000f0;  // nibbles 1 and 5
    // fp16 64.0 in both halves.
    static constexpr uint32_t MAGIC_NUM_1 = 0x54005400;
    // Same constant pre-shifted right by 4: bottom nibbles are combined
    // against it first and the whole pair is shifted left into place after.
    static constexpr uint32_t MAGIC_NUM_2 = MAGIC_NUM_1 >> 4;

    // Align nibbles 2/6 and 3/7 with the two masks above.
    const uint32_t top_i4s = i4s >> 8;

    asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[0]) : "r"(i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_2), "n"(immLut));
    asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[1]) : "r"(i4s), "n"(TOP_MASK), "n"(MAGIC_NUM_1), "n"(immLut));
    asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[2]) : "r"(top_i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_2), "n"(immLut));
    asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[3]) : "r"(top_i4s), "n"(TOP_MASK), "n"(MAGIC_NUM_1), "n"(immLut));
    // Shift the bottom-nibble pairs up into the same 64.0-biased layout as h[1]/h[3].
    h[0] <<= 4;
    h[2] <<= 4;

    return result;
}
// Dequantizes 8 packed 4-bit values into 8 bf16 values (uint4 = 4x
// __nv_bfloat162). Like the fp16 _v2 variant the outputs stay biased:
// each bf16 equals 128 + n (0x4300 is bf16 128.0; the nibble lands in the
// low mantissa bits). NOTE(review): the +128 bias is presumably cancelled
// by the downstream scales/zero-points — confirm in the consuming kernel.
// Output pairing: h[0]=(n0,n4), h[1]=(n1,n5), h[2]=(n2,n6), h[3]=(n3,n7).
__inline__ __device__ uint4 dequantize_s4_to_bf16x2_v2(uint32_t const& source)
{
    uint4 result;
    // View the four output bf16x2 registers as raw 32-bit lanes for the PTX below.
    uint32_t* h = reinterpret_cast<uint32_t*>(&result);
    uint32_t const& source_i4s = reinterpret_cast<uint32_t const&>(source);

    // lop3 immediate LUT 0xea computes (a & b) | c.
    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
    // Selects nibbles k and k+4 for the current shift amount.
    static constexpr uint32_t MASK = 0x000f000f;
    // bf16 128.0 in both halves.
    static constexpr uint32_t I4s_TO_BF16s_MAGIC_NUM = 0x43004300;

    // Working copy; shifted 4 bits per iteration to expose the next nibble pair.
    uint32_t i4s = source_i4s;
    asm ("lop3.b32 %0, %1, %2, %3, %4;\n"
        : "=r"(h[0])
        : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut));
    PRAGMA_UNROLL
    for (int ii = 1; ii < 4; ++ii)
    {
        i4s >>= 4;
        // h[ii] = (128 + n_ii, 128 + n_{ii+4}) as raw bf16 bit patterns.
        asm("lop3.b32 %0, %1, %2, %3, %4;\n"
            : "=r"(h[ii])
            : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut));
    }
    return result;
}
// Converts a generic pointer that refers to shared memory into the 32-bit
// shared-state-space address expected by ldmatrix / ld.shared instructions.
// Precondition: ptr must actually point into __shared__ storage.
__inline__ __device__ uint32_t cast_smem_ptr_to_uint(void const* const ptr)
{
    uint32_t smem_int_ptr;
    // cvta.to.shared converts generic -> shared-window address; the cvt then
    // narrows it to 32 bits (the shared window fits in 32 bits).
    asm("{.reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
        : "=r"(smem_int_ptr)
        : "l"(ptr));
    return smem_int_ptr;
}
// Loads four 8x8 tiles of 16-bit elements from shared memory into d0..d3
// with the warp-collective ldmatrix instruction. Requires SM75+ (asserts on
// older targets). smem_int_ptr is a shared-space address (see
// cast_smem_ptr_to_uint); per the PTX ISA each of the 32 lanes supplies the
// address of one 8-element row.
__inline__ __device__ void ldmatrix_m8n8_x4_b16(uint& d0, uint& d1, uint& d2, uint& d3, uint32_t smem_int_ptr)
{
#if APHRODITE_ARCH_SM75
    asm("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
        : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
        : "r"(smem_int_ptr));
#else
    assert(APHRODITE_ARCH_SM75);
#endif
}
// Loads two 8x8 tiles of 16-bit elements from shared memory into d0/d1 via
// ldmatrix. Requires SM75+ (asserts on older targets); smem_int_ptr is a
// shared-space address from cast_smem_ptr_to_uint.
__inline__ __device__ void ldmatrix_m8n8_x2_b16(uint& d0, uint& d1, uint32_t smem_int_ptr)
{
#if APHRODITE_ARCH_SM75
    asm("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" : "=r"(d0), "=r"(d1) : "r"(smem_int_ptr));
#else
    assert(APHRODITE_ARCH_SM75);
#endif
}
// Block-wide spin wait until the global flag *lock reads `status`.
// Only the thread with the given thread_id == 0 actually polls memory;
// __syncthreads_and broadcasts its progress: the loop exits once at least
// one thread's `state` equals `status`, which for non-zero `status` happens
// exactly when thread 0 has observed it.
// NOTE(review): threads other than 0 keep state == 0, so calling this with
// status == 0 returns immediately without ever reading the flag —
// presumably callers only wait on non-zero statuses; confirm.
// On SM70+ the load is an acquire across the GPU scope, ordering later
// reads after the releasing block's writes; older archs fall back to an
// L2-read (.cg) load.
__inline__ __device__ void wait_flag(int* lock, int status, int thread_id)
{
    int state = 0;
    while (__syncthreads_and(state != status)) {
        if (thread_id == 0) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
            asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
#else
            asm volatile("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
#endif
        }
    }
    // Make sure no thread races ahead before the flag value is settled.
    __syncthreads();
}
// Publishes `status` to the global flag *lock on behalf of the whole block.
// The leading __syncthreads() ensures every thread's prior work is complete
// before thread 0 stores; on SM70+ the store is a release across the GPU
// scope (pairs with the acquire load in wait_flag), older archs use an
// L2 write-through (.cg) store.
__inline__ __device__ void release_flag(int* lock, int status, int thread_id)
{
    __syncthreads();
    if (thread_id == 0) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
        asm volatile("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
#else
        asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
#endif
    }
}
// Applies dequantization parameters q to a value x, computing
// result = (x - z) * s where z and s are broadcast from q's two halves.
template <typename T>
__inline__ __device__ T apply_Q(const T& x, const T& q);

// half2 specialization, computed with flush-to-zero f16x2 PTX:
// z = (q.x, q.x), s = (q.y, q.y), result = (x - z) * s elementwise.
// NOTE(review): q.x looks like the zero-point and q.y the scale — confirm
// against the code that builds q.
template <>
__inline__ __device__ half2 apply_Q(const half2& x, const half2& q)
{
    uint s, z;
    // Build the broadcast pairs and reinterpret them as raw 32-bit lanes
    // so they can feed the f16x2 PTX instructions below.
    (half2&)z = __halves2half2(q.x, q.x);
    (half2&)s = __halves2half2(q.y, q.y);
    auto& t = (const uint&)x;
    uint u, v;
    asm("sub.ftz.f16x2 %0, %1, %2;\n" : "=r"(u) : "r"(t), "r"(z));  // u = x - z
    asm("mul.ftz.f16x2 %0, %1, %2;\n" : "=r"(v) : "r"(u), "r"(s));  // v = u * s
    return (half2&)v;
}
// Elementwise bf16x2 subtraction. __hsub2 on __nv_bfloat162 requires SM80+;
// on older targets this traps via assert in debug builds.
// Fix vs. original: the pre-SM80 path fell off the end of a non-void
// function (UB once assert compiles out under NDEBUG); an unreachable
// fallback return is added.
inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    assert(false);  // bf16 arithmetic unsupported before Ampere
    return a;       // unreachable; keeps the function well-formed
#else
    return __hsub2(a, b);
#endif
}
// Elementwise bf16x2 multiplication. __hmul2 on __nv_bfloat162 requires
// SM80+; on older targets this traps via assert in debug builds.
// Fix vs. original: the pre-SM80 path fell off the end of a non-void
// function (UB once assert compiles out under NDEBUG); an unreachable
// fallback return is added.
inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    assert(false);  // bf16 arithmetic unsupported before Ampere
    return a;       // unreachable; keeps the function well-formed
#else
    return __hmul2(a, b);
#endif
}
// Packs two bf16 scalars into a __nv_bfloat162; the __halves2bfloat162
// intrinsic path is guarded to SM80+ to match the rest of this header.
// Fix vs. original: the pre-SM80 path fell off the end of a non-void
// function (UB once assert compiles out under NDEBUG); an unreachable
// fallback return is added.
inline __device__ __nv_bfloat162 halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b){
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    assert(false);                  // guarded to Ampere like the other bf16 helpers
    return __nv_bfloat162(a, b);    // unreachable; keeps the function well-formed
#else
    return __halves2bfloat162(a, b);
#endif
}
// Widens a bf16 pair to a float2; the __bfloat1622float2 intrinsic path is
// guarded to SM80+ to match the rest of this header.
// Fix vs. original: the pre-SM80 path fell off the end of a non-void
// function (UB once assert compiles out under NDEBUG); an unreachable
// fallback return is added.
inline __device__ float2 bfloat1622float2(const __nv_bfloat162 a){
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    assert(false);           // guarded to Ampere like the other bf16 helpers
    return float2{0.f, 0.f}; // unreachable; keeps the function well-formed
#else
    return __bfloat1622float2(a);
#endif
}
// Narrows a float2 to a bf16 pair (round-to-nearest); the
// __float22bfloat162_rn intrinsic path is guarded to SM80+ to match the
// rest of this header.
// Fix vs. original: the pre-SM80 path fell off the end of a non-void
// function (UB once assert compiles out under NDEBUG); an unreachable
// fallback return is added.
inline __device__ __nv_bfloat162 float22bfloat162_rn(const float2 a){
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    assert(false);              // guarded to Ampere like the other bf16 helpers
    return __nv_bfloat162();    // unreachable; keeps the function well-formed
#else
    return __float22bfloat162_rn(a);
#endif
}
// bf16x2 specialization of apply_Q: result = (x - z) * s with
// z = (q.x, q.x) and s = (q.y, q.y), built on the SM80-gated bf16 helpers.
// NOTE(review): q.x appears to be the zero-point and q.y the scale —
// confirm against the producer of q.
template <>
__inline__ __device__ __nv_bfloat162 apply_Q(const __nv_bfloat162& x, const __nv_bfloat162& q)
{
    const __nv_bfloat162 z = halves2bfloat162(q.x, q.x);  // broadcast subtrahend
    const __nv_bfloat162 s = halves2bfloat162(q.y, q.y);  // broadcast multiplier
    return bf16hmul2(bf16hsub2(x, z), s);
}
// Clamps a float symmetrically to roughly half the fp16 range
// (+/-(65504 - 1000)/2) so a subsequent float->half conversion cannot
// overflow to +/-inf, with headroom for following accumulation.
// Fix vs. original: the `2.0` literals were double, promoting the whole
// expression (and the min/max calls) to double-precision on the device;
// the float suffixes keep everything in fp32 with an identical result
// (the clamp bounds are exactly representable either way).
__device__ __forceinline__ float clamp_inf_for_half(const float input)
{
    return input > 0.0f ? min(input, (HALF_FLT_MAX - 1000.0f) / 2.0f)
                        : max(input, (-HALF_FLT_MAX + 1000.0f) / 2.0f);
}
// Minimal fixed-size aggregate array usable from both host and device code
// (a lightweight stand-in for std::array in device contexts).
template<typename T, int N>
struct Array {
    T a[N];  // underlying storage; aggregate-initializable
    // Unchecked mutable element access.
    __device__ __host__ constexpr T& operator[](int i) noexcept
    {
        return a[i];
    }
    // Unchecked read-only element access.
    __device__ __host__ constexpr const T& operator[](int i) const noexcept
    {
        return a[i];
    }
};
// Compile-time shape: a pack of integer extents (e.g. Shape<M, N, K> for a
// GEMM tile). Accessors return std::integral_constant so extents remain
// usable as compile-time values at call sites. Requires C++17 (fold
// expression in count()).
template<int... Ns>
struct Shape {
    // Extents materialized into a constexpr array so get<> can index them.
    // NOTE(review): pre-C++17 an odr-use of data_ would need an out-of-line
    // definition — presumably fine since count() already requires C++17.
    static constexpr Array<int, sizeof...(Ns)> data_{Ns...};

    constexpr Shape() = default;

    // Enables construction/deduction from integral_constant tags.
    Shape(std::integral_constant<int, Ns>...){};

    // index-th extent, wrapped as std::integral_constant.
    template<int index>
    constexpr auto get() const noexcept
    {
        return std::integral_constant<int, data_[index]>{};
    }
    // GEMM-style accessors for extents 0, 1, 2.
    constexpr auto m() const noexcept
    {
        return get<0>();
    }
    constexpr auto n() const noexcept
    {
        return get<1>();
    }
    constexpr auto k() const noexcept
    {
        return get<2>();
    }
    // Convolution-style aliases for extents 0 and 1.
    // NOTE(review): unlike m()/n()/k() these return plain int, dropping the
    // compile-time constant type — possibly intentional; confirm before
    // relying on it in constant contexts.
    constexpr int c() const noexcept
    {
        return get<0>();
    }
    constexpr int s() const noexcept
    {
        return get<1>();
    }
    // Product of all extents.
    constexpr int count() const noexcept
    {
        return (Ns * ...);
    }
};
- }
- }