@@ -176,16 +176,14 @@ __global__ void __launch_bounds__(64)
for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) {
{
unsigned int addr;
- asm(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
+ asm("{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
"addr; }\n"
: "=r"(addr)
: "l"((void*)((&(A_shared[(k_0_1 * 16)])) +
(((((int)threadIdx.x) & 15) * 40) +
((((int)threadIdx.x) >> 4) * 8)))));
 
- asm(
- "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
+ asm("ldmatrix.sync.aligned.m8n8.x4.shared.b16"
"{%0, %1, %2, %3}, [%4];\n"
: "=r"(((unsigned*)(A_shared_warp + 0))[0]),
"=r"(((unsigned*)(A_shared_warp + 0))[1]),
@@ -197,8 +195,7 @@ __global__ void __launch_bounds__(64)
for (int ax1_0 = 0; ax1_0 < N / 32; ++ax1_0) {
{
unsigned int addr;
- asm(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
+ asm("{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
"addr; }\n"
: "=r"(addr)
: "l"((void*)((&(B_shared[(((k_0_1 * (N * 16 + 128)) +
@@ -206,8 +203,7 @@ __global__ void __launch_bounds__(64)
(ax1_0 * 16))])) +
(((((int)threadIdx.x) & 15) * (N + 8)) +
((((int)threadIdx.x) >> 4) * 8)))));
- asm(
- "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
+ asm("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
"{%0, %1, %2, %3}, [%4];\n"
: "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[0]),
"=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[1]),
@@ -219,8 +215,7 @@ __global__ void __launch_bounds__(64)
for (int j_0_4 = 0; j_0_4 < N / 32; ++j_0_4) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
{
- asm(
- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
"=f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
@@ -236,8 +231,7 @@ __global__ void __launch_bounds__(64)
}
 
{
- asm(
- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
"=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
@@ -253,8 +247,7 @@ __global__ void __launch_bounds__(64)
}
 
{
- asm(
- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
"=f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
@@ -270,8 +263,7 @@ __global__ void __launch_bounds__(64)
}
 
{
- asm(
- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
"=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
@@ -287,8 +279,7 @@ __global__ void __launch_bounds__(64)
}
#else
{
- asm(
- "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, "
"%13};\n"
: "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
@@ -308,8 +299,7 @@ __global__ void __launch_bounds__(64)
}
 
{
- asm(
- "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, "
"%13};\n"
: "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
@@ -558,16 +548,14 @@ __global__ void __launch_bounds__(64) group_gemm_forward_4bit_cuda_m16nXk32(
for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) {
{
unsigned int addr;
- asm(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
+ asm("{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
"addr; }\n"
: "=r"(addr)
: "l"((void*)((&(A_shared[(k_0_1 * 16)])) +
(((((int)threadIdx.x) & 15) * 40) +
((((int)threadIdx.x) >> 4) * 8)))));
 
- asm(
- "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
+ asm("ldmatrix.sync.aligned.m8n8.x4.shared.b16"
"{%0, %1, %2, %3}, [%4];\n"
: "=r"(((unsigned*)(A_shared_warp + 0))[0]),
"=r"(((unsigned*)(A_shared_warp + 0))[1]),
@@ -579,8 +567,7 @@ __global__ void __launch_bounds__(64) group_gemm_forward_4bit_cuda_m16nXk32(
for (int ax1_0 = 0; ax1_0 < N / 32; ++ax1_0) {
{
unsigned int addr;
- asm(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
+ asm("{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
"addr; }\n"
: "=r"(addr)
: "l"((void*)((&(B_shared[(((k_0_1 * (N * 16 + 128)) +
@@ -588,8 +575,7 @@ __global__ void __launch_bounds__(64) group_gemm_forward_4bit_cuda_m16nXk32(
(ax1_0 * 16))])) +
(((((int)threadIdx.x) & 15) * (N + 8)) +
((((int)threadIdx.x) >> 4) * 8)))));
- asm(
- "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
+ asm("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
"{%0, %1, %2, %3}, [%4];\n"
: "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[0]),
"=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[1]),
@@ -601,8 +587,7 @@ __global__ void __launch_bounds__(64) group_gemm_forward_4bit_cuda_m16nXk32(
for (int j_0_4 = 0; j_0_4 < N / 32; ++j_0_4) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
{
- asm(
- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
"=f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
@@ -618,8 +603,7 @@ __global__ void __launch_bounds__(64) group_gemm_forward_4bit_cuda_m16nXk32(
}
 
{
- asm(
- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
"=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
@@ -635,8 +619,7 @@ __global__ void __launch_bounds__(64) group_gemm_forward_4bit_cuda_m16nXk32(
}
 
{
- asm(
- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
"=f"(((float*)(C_warp + (j_0_4 * 8)))[1]),
@@ -652,8 +635,7 @@ __global__ void __launch_bounds__(64) group_gemm_forward_4bit_cuda_m16nXk32(
}
 
{
- asm(
- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
"=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]),
@@ -669,8 +651,7 @@ __global__ void __launch_bounds__(64) group_gemm_forward_4bit_cuda_m16nXk32(
}
#else
{
- asm(
- "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, "
"%13};\n"
: "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]),
@@ -690,8 +671,7 @@ __global__ void __launch_bounds__(64) group_gemm_forward_4bit_cuda_m16nXk32(
}
 
{
- asm(
- "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+ asm("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, "
"%13};\n"
: "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]),
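
Note on the hunks above: this is a formatting-only change. clang-format folds the first string literal of each multi-line asm(...) statement onto the asm( line; the PTX text, operands, and constraints are byte-for-byte unchanged, so the compiled kernel is identical. For readers unfamiliar with the idiom being reformatted, below is a minimal, self-contained sketch of the same cvta + ldmatrix + mma inline-PTX pattern in the merged style. It is an illustration under assumptions, not code from this diff: the names (mma_tile_sketch, smem_A, smem_B, a_frag, b_frag, acc), the lane-to-address mapping, and the sm_80+ m16n8k16 shape are placeholders standing in for the kernel's actual A_shared/B_shared layout and dispatch.

#include <cuda_fp16.h>

// Sketch only: one m16n8k16 tensor-core MAC fed by ldmatrix, written in the
// post-clang-format style used in this diff. Assumes smem_A/smem_B are
// 16-byte-aligned shared-memory tiles laid out so that each lane's address
// below points at a distinct 8-element half row.
__device__ void mma_tile_sketch(const half* smem_A, const half* smem_B,
                                float* acc /* 4 floats per lane */) {
  unsigned a_frag[4];  // A fragment: four packed-b16 registers per lane
  unsigned b_frag[2];  // B fragment: two packed-b16 registers per lane
  int lane = (int)threadIdx.x & 31;

  // Same pattern as the diff: convert a generic shared-memory pointer into
  // the 32-bit shared-state-space address that ldmatrix expects.
  unsigned int addr;
  asm("{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
      "addr; }\n"
      : "=r"(addr)
      : "l"((const void*)(smem_A + lane * 8)));

  // .x4: each lane supplies one row address; four 8x8 b16 tiles of A land
  // in a_frag[0..3] (sm_75+).
  asm("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n"
      : "=r"(a_frag[0]), "=r"(a_frag[1]), "=r"(a_frag[2]), "=r"(a_frag[3])
      : "r"(addr));

  asm("{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, "
      "addr; }\n"
      : "=r"(addr)
      : "l"((const void*)(smem_B + lane * 8)));

  // .trans: B rows are transposed on load, matching the row.col mma below.
  asm("ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0, %1}, [%2];\n"
      : "=r"(b_frag[0]), "=r"(b_frag[1])
      : "r"(addr));

  // D = A * B + C with f16 inputs and f32 accumulators (sm_80+ shape); the
  // __CUDA_ARCH__ == 750 path in the diff issues two m16n8k8 ops instead.
  asm("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
      "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, "
      "%13};\n"
      : "=f"(acc[0]), "=f"(acc[1]), "=f"(acc[2]), "=f"(acc[3])
      : "r"(a_frag[0]), "r"(a_frag[1]), "r"(a_frag[2]), "r"(a_frag[3]),
        "r"(b_frag[0]), "r"(b_frag[1]), "f"(acc[0]), "f"(acc[1]),
        "f"(acc[2]), "f"(acc[3]));
}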