123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654 |
- #include <torch/all.h>
- #include <torch/python.h>
- #include <cuda.h>
- #include <cuda_runtime.h>
- #include <cuda_fp16.h>
- // atomicAdd for double-precision floating-point numbers on hardware with
- // compute capability < 6.0 from:
- // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
- // #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
- // __device__ double atomicAdd(
- // double* address,
- // double val
- // ) {
- // unsigned long long int* address_as_ull = (unsigned long long int*)address;
- // unsigned long long int old = *address_as_ull, assumed;
- //
- // do {
- // assumed = old;
- // old = atomicCAS(
- // address_as_ull,
- // assumed,
- // __double_as_longlong(val + __longlong_as_double(assumed))
- // );
- //
- // // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
- // } while (assumed != old);
- //
- // return __longlong_as_double(old);
- // }
- // #endif
- #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
- // adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
- __device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
- unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
- unsigned int old = *address_as_ui;
- unsigned int assumed;
- do {
- assumed = old;
- unsigned short hsum = reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
- hsum += val;
- old = reinterpret_cast<size_t>(address) & 2
- ? (old & 0xffff) | (hsum << 16)
- : (old & 0xffff0000) | hsum;
- old = atomicCAS(address_as_ui, assumed, old);
- // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
- } while (assumed != old);
- }
- __device__ __forceinline__ void atomicAdd(__half* address, c10::Half val) {
- unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
- unsigned int old = *address_as_ui;
- unsigned int assumed;
- do {
- assumed = old;
- __half_raw hsum;
- hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
- half tmpres = __hadd(hsum, val);
- hsum = __half_raw(tmpres);
- old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
- old = atomicCAS(address_as_ui, assumed, old);
- } while (assumed != old);
- }
- #endif
- template <typename scalar_t>
- __global__ void VecQuant2MatMulKernel(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- const int* __restrict__ g_idx,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width
- );
- template <typename scalar_t>
- __global__ void VecQuant3MatMulKernel(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- const int* __restrict__ g_idx,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width
- );
- template <typename scalar_t>
- __global__ void VecQuant4MatMulKernel(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- const int* __restrict__ g_idx,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width
- );
- template <typename scalar_t>
- __global__ void VecQuant8MatMulKernel(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- const int* __restrict__ g_idx,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width
- );
- template <typename scalar_t>
- __global__ void VecQuant2MatMulKernel_old(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width,
- int groupsize
- );
- template <typename scalar_t>
- __global__ void VecQuant3MatMulKernel_old(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width,
- int groupsize
- );
- template <typename scalar_t>
- __global__ void VecQuant4MatMulKernel_old(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width,
- int groupsize
- );
- template <typename scalar_t>
- __global__ void VecQuant8MatMulKernel_old(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width,
- int groupsize
- );
- __global__ void VecQuant2MatMulKernelFaster_old(
- const half2* __restrict__ vec,
- const int* __restrict__ mat,
- float* __restrict__ mul,
- const float* __restrict__ scales,
- const int* __restrict__ zeros,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width,
- int groupsize
- );
- __global__ void VecQuant3MatMulKernelFaster_old(
- const half2* __restrict__ vec,
- const int* __restrict__ mat,
- float* __restrict__ mul,
- const float* __restrict__ scales,
- const int* __restrict__ zeros,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width,
- int groupsize
- );
- __global__ void VecQuant4MatMulKernelFaster_old(
- const half2* __restrict__ vec,
- const int* __restrict__ mat,
- float* __restrict__ mul,
- const float* __restrict__ scales,
- const int* __restrict__ zeros,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width,
- int groupsize
- );
- const int BLOCKWIDTH = 256;
- const int BLOCKHEIGHT2 = 16;
- const int BLOCKHEIGHT3 = 24;
- const int BLOCKHEIGHT4 = 32;
- const int BLOCKHEIGHT8 = 64;
- __device__ inline unsigned int as_unsigned(int i) {
- return *reinterpret_cast<unsigned int*>(&i);
- }
- __device__ inline int as_int(int i) {
- return *reinterpret_cast<int*>(&i);
- }
- void vecquant2matmul_cuda(
- torch::Tensor vec,
- torch::Tensor mat,
- torch::Tensor mul,
- torch::Tensor scales,
- torch::Tensor zeros,
- torch::Tensor g_idx
- ) {
- int batch = vec.size(0);
- int vec_height = vec.size(1);
- int height = mat.size(0);
- int width = mat.size(1);
- int zero_width = zeros.size(1);
- dim3 blocks(
- (height + BLOCKHEIGHT2 - 1) / BLOCKHEIGHT2,
- (width + BLOCKWIDTH - 1) / BLOCKWIDTH
- );
- dim3 threads(BLOCKWIDTH);
- AT_DISPATCH_FLOATING_TYPES(
- vec.type(), "vecquant2matmul_cuda", ([&] {
- VecQuant2MatMulKernel<<<blocks, threads>>>(
- vec.data<scalar_t>(), mat.data<int>(), mul.data<scalar_t>(),
- scales.data<scalar_t>(), zeros.data<int>(), g_idx.data<int>(),
- batch, vec_height, height, width, zero_width
- );
- })
- );
- }
- template <typename scalar_t>
- __global__ void VecQuant2MatMulKernel(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- const int* __restrict__ g_idx,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width
- ) {
- int h = BLOCKHEIGHT2 * blockIdx.x;
- int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;
- __shared__ scalar_t blockvec[BLOCKWIDTH];
- int i = width * h + w;
- int g_h = h * 16;
- int k;
- unsigned int g;
- scalar_t w_tmp;
- int z_w = w / 16;
- int z_mod = (w % 16) * 2;
- float weight[BLOCKWIDTH];
- for (k = 0; k < BLOCKWIDTH; ++k){
- int k_w = (k / 16);
- int k_bit = (k % 16) * 2;
- g = as_int(g_idx[g_h + k]);
- scalar_t scale = scales[g * width + w];
- scalar_t zero = scalar_t((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod & 0x3) + 1);
- w_tmp = ((as_unsigned(mat[i + (k_w * width)]) >> k_bit) & 0x3);
- weight[k] = scale * (w_tmp - zero);
- }
- scalar_t res;
- for (int b = 0; b < batch; ++b){
- res = 0;
- blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
- __syncthreads();
- for (k = 0; k < BLOCKWIDTH; ++k){
- res += weight[k] * blockvec[k];
- }
- atomicAdd(&mul[b * width + w], res);
- __syncthreads();
- }
- }
- void vecquant3matmul_cuda(
- torch::Tensor vec,
- torch::Tensor mat,
- torch::Tensor mul,
- torch::Tensor scales,
- torch::Tensor zeros,
- torch::Tensor g_idx
- ) {
- int batch = vec.size(0);
- int vec_height = vec.size(1);
- int height = mat.size(0);
- int width = mat.size(1);
- int zero_width = zeros.size(1);
- dim3 blocks(
- (height + BLOCKHEIGHT3 - 1) / BLOCKHEIGHT3,
- (width + BLOCKWIDTH - 1) / BLOCKWIDTH
- );
- dim3 threads(BLOCKWIDTH);
- AT_DISPATCH_FLOATING_TYPES(
- vec.type(), "vecquant3matmul_cuda", ([&] {
- VecQuant3MatMulKernel<<<blocks, threads>>>(
- vec.data<scalar_t>(), mat.data<int>(), mul.data<scalar_t>(),
- scales.data<scalar_t>(), zeros.data<int>(), g_idx.data<int>(),
- batch, vec_height, height, width, zero_width
- );
- })
- );
- }
- template <typename scalar_t>
- __global__ void VecQuant3MatMulKernel(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- const int* __restrict__ g_idx,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width
- ) {
- int h = BLOCKHEIGHT3 * blockIdx.x;
- int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;
- __shared__ scalar_t blockvec[BLOCKWIDTH];
- int i = width * h + w;
- int g_h = (h / 3) * 32;
- int k;
- unsigned int g;
- scalar_t w_tmp;
- int z_w = (w / 32) * 3;
- int z_mod = w % 32;
- int z_bit;
- unsigned int z_tmp;
- if (z_mod != 10){
- if (z_mod != 21){
- z_bit = z_mod;
- if (z_bit > 21){
- z_bit -= 22;
- z_bit *= 3;
- z_bit += 2;
- z_w += 2;
- } else if (z_bit > 10){
- z_bit -= 11;
- z_bit *= 3;
- z_bit += 1;
- z_w += 1;
- } else {
- z_bit *= 3;
- }
- } else {
- z_w += 1;
- }
- }
- float weight[BLOCKWIDTH];
- for (k = 0; k < BLOCKWIDTH; ++k){
- int k_w = (k / 32) * 3;
- int k_mod = k % 32;
- int k_bit;
- if (k_mod != 10){
- if (k_mod != 21){
- k_bit = k_mod;
- if (k_bit > 21){
- k_bit -= 22;
- k_bit *= 3;
- k_bit += 2;
- k_w += 2;
- } else if (k_bit > 10){
- k_bit -= 11;
- k_bit *= 3;
- k_bit += 1;
- k_w += 1;
- } else {
- k_bit *= 3;
- }
- } else {
- k_w += 1;
- }
- }
- g = as_int(g_idx[g_h + k]);
- scalar_t scale = scales[g * width + w];
- scalar_t zero;
- if (z_mod == 10) {
- z_tmp = (as_unsigned(zeros[g * zero_width + z_w]) >> 30) | ((as_unsigned(zeros[g * zero_width + (z_w + 1)]) << 2) & 0x4);
- zero = scalar_t((z_tmp) + 1);
- } else if (z_mod == 21){
- z_tmp = (as_unsigned(zeros[g * zero_width + z_w]) >> 31) | ((as_unsigned(zeros[g * zero_width + (z_w + 1)]) << 1) & 0x6);
- zero = scalar_t((z_tmp) + 1);
- } else {
- zero = scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_bit) & 0x7) + 1);
- }
- if (k_mod == 10) {
- w_tmp = (as_unsigned(mat[i + (k_w * width)]) >> 30) | ((as_unsigned(mat[i + ((k_w + 1)* width)]) << 2) & 0x4);
- } else if (k_mod == 21){
- w_tmp = (as_unsigned(mat[i + (k_w * width)]) >> 31) | ((as_unsigned(mat[i + ((k_w + 1)* width)]) << 1) & 0x6);
- } else {
- w_tmp = ((as_unsigned(mat[i + (k_w * width)]) >> k_bit) & 0x7);
- }
- weight[k] = scale * (w_tmp - zero);
- }
- scalar_t res;
- for (int b = 0; b < batch; ++b){
- res = 0;
- blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
- __syncthreads();
- for (k = 0; k < BLOCKWIDTH; ++k){
- res += weight[k] * blockvec[k];
- }
- atomicAdd(&mul[b * width + w], res);
- __syncthreads();
- }
- }
- void vecquant4matmul_cuda(
- torch::Tensor vec,
- torch::Tensor mat,
- torch::Tensor mul,
- torch::Tensor scales,
- torch::Tensor zeros,
- torch::Tensor g_idx
- ) {
- int batch = vec.size(0);
- int vec_height = vec.size(1);
- int height = mat.size(0);
- int width = mat.size(1);
- int zero_width = zeros.size(1);
- dim3 blocks(
- (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
- (width + BLOCKWIDTH - 1) / BLOCKWIDTH
- );
- dim3 threads(BLOCKWIDTH);
- AT_DISPATCH_FLOATING_TYPES(
- vec.type(), "vecquant4matmul_cuda", ([&] {
- VecQuant4MatMulKernel<<<blocks, threads>>>(
- vec.data<scalar_t>(), mat.data<int>(), mul.data<scalar_t>(),
- scales.data<scalar_t>(), zeros.data<int>(), g_idx.data<int>(),
- batch, vec_height, height, width, zero_width
- );
- })
- );
- }
- template <typename scalar_t>
- __global__ void VecQuant4MatMulKernel(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- const int* __restrict__ g_idx,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width
- ) {
- int h = BLOCKHEIGHT4 * blockIdx.x;
- int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;
- __shared__ scalar_t blockvec[BLOCKWIDTH];
- int i = width * h + w;
- int g_h = h * 8;
- int k;
- unsigned int g;
- scalar_t w_tmp;
- int z_w = w / 8;
- int z_mod = (w % 8) * 4;
- float weight[BLOCKWIDTH];
- for (k = 0; k < BLOCKWIDTH; ++k){
- int k_w = (k / 8);
- int k_bit = (k % 8) * 4;
- g = as_int(g_idx[g_h + k]);
- scalar_t scale = scales[g * width + w];
- scalar_t zero = scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0xF) + 1);
- w_tmp = ((as_unsigned(mat[i + (k_w * width)]) >> k_bit) & 0xF);
- weight[k] = scale * (w_tmp - zero);
- }
- scalar_t res;
- for (int b = 0; b < batch; ++b){
- res = 0;
- blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
- __syncthreads();
- for (k = 0; k < BLOCKWIDTH; ++k){
- res += weight[k] * blockvec[k];
- }
- atomicAdd(&mul[b * width + w], res);
- __syncthreads();
- }
- }
- void vecquant8matmul_cuda(
- torch::Tensor vec,
- torch::Tensor mat,
- torch::Tensor mul,
- torch::Tensor scales,
- torch::Tensor zeros,
- torch::Tensor g_idx
- ) {
- int batch = vec.size(0);
- int vec_height = vec.size(1);
- int height = mat.size(0);
- int width = mat.size(1);
- int zero_width = zeros.size(1);
- dim3 blocks(
- (height + BLOCKHEIGHT8 - 1) / BLOCKHEIGHT8,
- (width + BLOCKWIDTH - 1) / BLOCKWIDTH
- );
- dim3 threads(BLOCKWIDTH);
- AT_DISPATCH_FLOATING_TYPES(
- vec.type(), "vecquant8matmul_cuda", ([&] {
- VecQuant8MatMulKernel<<<blocks, threads>>>(
- vec.data<scalar_t>(), mat.data<int>(), mul.data<scalar_t>(),
- scales.data<scalar_t>(), zeros.data<int>(), g_idx.data<int>(),
- batch, vec_height, height, width, zero_width
- );
- })
- );
- }
- template <typename scalar_t>
- __global__ void VecQuant8MatMulKernel(
- const scalar_t* __restrict__ vec,
- const int* __restrict__ mat,
- scalar_t* __restrict__ mul,
- const scalar_t* __restrict__ scales,
- const int* __restrict__ zeros,
- const int* __restrict__ g_idx,
- int batch,
- int vec_height,
- int height,
- int width,
- int zero_width
- ) {
- int h = BLOCKHEIGHT8 * blockIdx.x;
- int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;
- __shared__ scalar_t blockvec[BLOCKWIDTH];
- int i = width * h + w;
- int g_h = h * 4;
- int k;
- unsigned int g;
- scalar_t w_tmp;
- int z_w = w / 4;
- int z_mod = (w % 4) * 8;
- float weight[BLOCKWIDTH];
- for (k = 0; k < BLOCKWIDTH; ++k){
- int k_w = (k / 4);
- int k_bit = (k % 4) * 8;
- g = as_int(g_idx[g_h + k]);
- scalar_t scale = scales[g * width + w];
- scalar_t zero = scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0xFF) + 1);
- w_tmp = ((as_unsigned(mat[i + (k_w * width)]) >> k_bit) & 0xFF);
- weight[k] = scale * (w_tmp - zero);
- }
- scalar_t res;
- for (int b = 0; b < batch; ++b){
- res = 0;
- blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
- __syncthreads();
- for (k = 0; k < BLOCKWIDTH; ++k){
- res += weight[k] * blockvec[k];
- }
- atomicAdd(&mul[b * width + w], res);
- __syncthreads();
- }
- }
|