@@ -1,7 +1,9 @@
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
-#include <mma.h>
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  #include <mma.h>
+#endif

#include <ATen/ATen.h>
#include <ATen/core/Tensor.h>
@@ -103,6 +105,7 @@ static __device__ void store(
int32_t nTile,
int32_t laneId,
const float4& out) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800

// sum.x / sum.y are written at
// [laneId / 4], [(laneId % 4) * 2, (laneId % 4) * 2 + 1]
@@ -126,6 +129,7 @@ static __device__ void store(
if (outRow + 8 < m) {
*reinterpret_cast<half2*>(cPtr + 8 * n) = v23;
}
+#endif
}
};

@@ -144,6 +148,7 @@ static __device__ void load(
int32_t kTileStart,
int32_t laneId,
f16x2x2_u32 b[KTilesPerIteration]) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
auto Bptr = reinterpret_cast<const uint8_t*>(B);
#pragma unroll
for (int i = 0; i < KTilesPerIteration; ++i) {
@@ -151,6 +156,7 @@ static __device__ void load(
const int col = (kTileStart + i) * kKTileSize / 4 + laneId % 4;
*(reinterpret_cast<uint64_t*>(b[i].vals)) = CB[Bptr[row * k/4 + col]];
}
+#endif
}
};

@@ -169,6 +175,7 @@ static __device__ void load(
int32_t kTileStart,
int32_t laneId,
f16x2x2_u32 b[KTilesPerIteration]) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
auto Bptr = reinterpret_cast<const uint32_t*>(B);
#pragma unroll
for (int i = 0; i < KTilesPerIteration; ++i) {
@@ -189,6 +196,7 @@ static __device__ void load(
*(half2*)(b[i].vals) = __hfma2(*((half2*)(&q0)), y16, z16);
*(half2*)(b[i].vals+1) = __hfma2(*((half2*)(&q1)), y16, z16);
}
+#endif
}
};

@@ -199,6 +207,7 @@ __device__ static inline uint64_t decode8weights(
uint16_t weight_compressed,
const int64_t *__restrict__ codebook_abs
) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800

uint8_t bits_sign = weight_compressed & 0xff;
uint8_t parity = __popc(bits_sign) & 1;
@@ -215,6 +224,7 @@ __device__ static inline uint64_t decode8weights(
packed -= parity * 0x0202020202020202;

return packed;
+#endif
}

__device__ static inline uint32_t decode8weights(
@@ -222,6 +232,7 @@ __device__ static inline uint32_t decode8weights(
const int64_t *__restrict__ codebook_abs,
int idx
) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
uint8_t bits_sign = weight_compressed & 0xff; //__brev(weight_compressed) >> 24;
const uint32_t magic_nums[2] = {0x08040201ll, 0x80402010ll};
uint8_t parity = __popc(bits_sign) & 1;
@@ -237,6 +248,7 @@ __device__ static inline uint32_t decode8weights(
packed |= 0x01010101;
packed -= parity * 0x02020202;
return packed;
+#endif
};

template <int KTilesPerIteration>
@@ -251,6 +263,7 @@ static __device__ void load(
int32_t kTileStart,
int32_t laneId,
f16x2x2_u32 b[KTilesPerIteration]) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
auto Bptr = (const uint16_t*) B;
#pragma unroll
for (int i = 0; i < KTilesPerIteration; ++i) {
@@ -275,6 +288,7 @@ static __device__ void load(
//*((half*)(b[i].vals) + 2) = unpacked[1].x;
//*((half*)(b[i].vals) + 3) = unpacked[1].y;
}
+#endif
}
};

@@ -304,6 +318,7 @@ __launch_bounds__(256) void tinygemm_m16n8k16_chunk_kernel(
int32_t mTiles,
int32_t nTiles,
int32_t kTiles) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
__shared__ uint64_t CB_[256];
if (BLayout::use_codebook) {
CB_[threadIdx.x + threadIdx.y * 32] = CB[threadIdx.x + threadIdx.y * 32];
@@ -444,12 +459,14 @@ __launch_bounds__(256) void tinygemm_m16n8k16_chunk_kernel(
laneId,
sum_f32);
}
+#endif
}

at::Tensor d4_mm_origorder(
const at::Tensor& A,
const at::Tensor& B,
const at::Tensor& CB) {
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
c10::cuda::CUDAGuard g(A.device());
auto stream = at::cuda::getCurrentCUDAStream();

@@ -488,12 +505,14 @@ at::Tensor d4_mm_origorder(
kTiles);

return C_final;
+#endif
}

at::Tensor e8p_mm_origorder(
const at::Tensor& A,
const at::Tensor& B,
const at::Tensor& CB) {
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
c10::cuda::CUDAGuard g(A.device());
auto stream = at::cuda::getCurrentCUDAStream();

@@ -531,11 +550,13 @@
kTiles);

return C_final;
+#endif
}

at::Tensor hi_mm_origorder(
const at::Tensor& A,
const at::Tensor& B) {
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
c10::cuda::CUDAGuard g(A.device());
auto stream = at::cuda::getCurrentCUDAStream();

@@ -573,6 +594,7 @@ at::Tensor hi_mm_origorder(
kTiles);

return C_final;
+#endif
}

#define DECOMPRESS_D4_BLOCK_SIZE 256
@@ -582,12 +604,14 @@ __global__ void cuda_decompress_d4_origorder_kernel(
const c10::Half* __restrict__ CB, // 256 x 4
c10::Half* __restrict__ Y // m x n
) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
const long i = threadIdx.x + DECOMPRESS_D4_BLOCK_SIZE * blockIdx.x;

for(long r = 0; r < 4; r++) {
uint8_t yidx = ((uint8_t*)YIs)[i*4 + r];
((uint64_t*)Y)[i*4 + r] = ((uint64_t*)CB)[yidx & 255];
}
+#endif
}


@@ -596,6 +620,7 @@ void decompress_d4_origorder(
torch::Tensor CB, // 256 x 4
torch::Tensor Y // m x n
) {
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
size_t m = Y.sizes()[0];
size_t n = Y.sizes()[1];

@@ -615,6 +640,7 @@ void decompress_d4_origorder(
CB.data_ptr<c10::Half>(),
Y.data_ptr<c10::Half>()
);
+#endif
}

#define DECOMPRESS_E8P_BLOCK_SIZE 256
@@ -624,6 +650,7 @@ __global__ void cuda_decompress_e8p_origorder_kernel(
const int64_t* __restrict__ CB, // 256 x 8
c10::Half* __restrict__ Y // m x n
) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
const long i = threadIdx.x + DECOMPRESS_E8P_BLOCK_SIZE * blockIdx.x;
uint16_t yidx = ((uint16_t*)YIs)[i];
uint64_t decoded = BLayout_E8::decode8weights(yidx, CB);
@@ -643,6 +670,7 @@ __global__ void cuda_decompress_e8p_origorder_kernel(
((__half2*)Y)[i*4+2] = __hadd2(unpacked[0][1], adjust); // 45
((__half2*)Y)[i*4+1] = __hadd2(unpacked[1][0], adjust); // 23
((__half2*)Y)[i*4+3] = __hadd2(unpacked[1][1], adjust); // 67
+#endif
}


@@ -651,6 +679,7 @@ void decompress_e8p_origorder(
torch::Tensor CB, // 256 x 8
torch::Tensor &Y // m x n
) {
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
size_t m = Y.sizes()[0];
size_t n = Y.sizes()[1];

@@ -670,6 +699,7 @@ void decompress_e8p_origorder(
CB.data_ptr<int64_t>(),
Y.data_ptr<c10::Half>()
);
+#endif
}

#define DECOMPRESS_HI_BLOCK_SIZE 256
@@ -678,6 +708,7 @@ __global__ void cuda_decompress_hi_origorder_kernel(
const uint32_t* __restrict__ YIs, // m x (n/8)
c10::Half* __restrict__ Y // m x n
) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
const long i = threadIdx.x + DECOMPRESS_HI_BLOCK_SIZE * blockIdx.x;
uint32_t qa = YIs[i];

@@ -697,12 +728,14 @@ __global__ void cuda_decompress_hi_origorder_kernel(
((__half2*)Y)[i*4+1] = __hfma2(*((half2*)(&q1)), y16, z16);
((__half2*)Y)[i*4+2] = __hfma2(*((half2*)(&q2)), y16, z16);
((__half2*)Y)[i*4+3] = __hfma2(*((half2*)(&q3)), y16, z16);
+#endif
}

void decompress_hi_origorder(
torch::Tensor YIs, // m x (n/8)
torch::Tensor Y // m x n
){
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
size_t m = Y.sizes()[0];
size_t n = Y.sizes()[1];

@@ -719,4 +752,5 @@
(uint32_t*)YIs.data_ptr<int32_t>(),
Y.data_ptr<c10::Half>()
);
+#endif
}
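
Note on the guard pattern used above: NVCC defines __CUDA_ARCH__ only while compiling device code, never during the host pass. Bodies of __device__ and __global__ functions can therefore be guarded with defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800, but host-side launchers need the inverted form !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800, or their bodies vanish from host compilation and the at::Tensor wrappers fall off the end without returning. A minimal standalone sketch of the pattern, assuming device 0 is the target; demo_kernel and launch_demo are illustrative names, not part of this patch:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void demo_kernel(float* out) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  // Only compiled when targeting sm_80+; on older arches the body is empty.
  out[threadIdx.x] = static_cast<float>(threadIdx.x);
#endif
}

void launch_demo(float* out) {
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
  // __CUDA_ARCH__ is undefined in host code, so the first clause keeps this
  // body alive during the host pass. A runtime capability check makes a
  // pre-Ampere GPU fail loudly instead of silently running an empty kernel.
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  if (prop.major < 8) {
    std::fprintf(stderr, "demo_kernel requires sm_80+\n");
    return;
  }
  demo_kernel<<<1, 32>>>(out);
#endif
}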