1
0

qdq_3.cuh 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. #ifndef _qdq_3_cuh
  2. #define _qdq_3_cuh
  3. #include "qdq_util.cuh"
  4. namespace aphrodite {
  5. namespace gptq {
  6. // Permutation:
  7. //
  8. // v9997775 55333111 u8886664 44222000 (u, v lsb)
  9. // vjjjhhhf ffdddbbb uiiiggge eecccaaa
  10. // vtttrrrp ppnnnlll usssqqqo oommmkkk
  // Re-orders, in place, 32 consecutive 3-bit values that are packed
  // back-to-back across three 32-bit words located at q[0], q[stride] and
  // q[2 * stride].
  //
  // Naming the 32 values 0..9, a..v (in stream order), the words on entry are
  //
  //   q[0 * stride]: aa999888 77766655 54443332 22111000
  //   q[1 * stride]: lkkkjjji iihhhggg fffeeedd dcccbbba
  //   q[2 * stride]: vvvuuutt tsssrrrq qqpppooo nnnmmmll
  //
  // and on exit
  //
  //   q[0 * stride]: v9997775 55333111 u8886664 44222000
  //   q[1 * stride]: vjjjhhhf ffdddbbb uiiiggge eecccaaa
  //   q[2 * stride]: vtttrrrp ppnnnlll usssqqqo oommmkkk
  //
  // i.e. each word holds ten values, even-positioned ones in bits 0..14 and
  // odd-positioned ones in bits 16..30, while the three bits of u are spread
  // over bit 15 of the three words and the three bits of v over bit 31
  // (least significant bit in the first word).
  //
  // q      - pointer to the first of the three packed words
  // stride - distance, in uint32_t elements, between the three words
  __forceinline__ __device__ void shuffle_3bit_32
  (
      uint32_t* q,
      int stride
  )
  {
      // Load everything before any store so the rewrite is truly in place.
      const uint32_t in0 = q[0 * stride];
      const uint32_t in1 = q[1 * stride];
      const uint32_t in2 = q[2 * stride];

      // The last two values (u, v) live in the top six bits of the third word.
      const uint32_t tail = in2 >> 26;                        // vvvuuu

      // Re-align the stream so every group of ten values starts at bit 0.
      // Bits 30..31 of each group are leftovers and are never read below.
      const uint32_t grp0 = in0;                              // 0..9
      const uint32_t grp1 = (in1 << 2) | (in0 >> 30);         // a..j
      const uint32_t grp2 = (in2 << 4) | (in1 >> 28);         // k..t

      uint32_t out0 = 0;
      uint32_t out1 = 0;
      uint32_t out2 = 0;

      // Each iteration consumes one (even, odd) pair of values per group:
      // the even one goes to the low half-word, the odd one to the high one.
      for (int pair = 0; pair < 5; ++pair)
      {
          const int from_even = 6 * pair;
          const int from_odd  = from_even + 3;
          const int to_even   = 3 * pair;
          const int to_odd    = to_even + 16;

          out0 |= ((grp0 >> from_even) & 0x07) << to_even;
          out0 |= ((grp0 >> from_odd)  & 0x07) << to_odd;

          out1 |= ((grp1 >> from_even) & 0x07) << to_even;
          out1 |= ((grp1 >> from_odd)  & 0x07) << to_odd;

          out2 |= ((grp2 >> from_even) & 0x07) << to_even;
          out2 |= ((grp2 >> from_odd)  & 0x07) << to_odd;
      }

      // Scatter u (tail bits 0..2) over bit 15 and v (tail bits 3..5) over
      // bit 31 of the three output words, one bit per word.
      out0 |= ((tail >> 0) & 0x01u) << 15;
      out1 |= ((tail >> 1) & 0x01u) << 15;
      out2 |= ((tail >> 2) & 0x01u) << 15;
      out0 |= ((tail >> 3) & 0x01u) << 31;
      out1 |= ((tail >> 4) & 0x01u) << 31;
      out2 |= ((tail >> 5) & 0x01u) << 31;

      q[0 * stride] = out0;
      q[1 * stride] = out1;
      q[2 * stride] = out2;
  }
  // Expands three words in the layout written by shuffle_3bit_32 into 32
  // half-precision values, each equal to (3-bit value - zero).
  //
  //   q_0, q_1, q_2 - the three shuffled words
  //   dq            - output; dq[k] holds values 2k (low 16-bit lane) and
  //                   2k + 1 (high 16-bit lane)
  //   stride        - not used by this function
  //   zero          - zero point subtracted from every value
  //
  // Trick used throughout: 0x6400 is the fp16 encoding of 1024, so OR-ing an
  // integer n into its low mantissa bits produces the half value 1024 + n
  // with no int->half conversion. A field sitting 3 (or 6) bits higher comes
  // out as 1024 + 8n (or 1024 + 64n) and is brought back with a single fused
  // multiply-add by 1/8 (or 1/64).
  __forceinline__ __device__ void dequant_3bit_32
  (
      const uint32_t q_0,
      const uint32_t q_1,
      const uint32_t q_2,
      half2 (&dq)[16],
      int stride,
      const uint32_t zero
  )
  {
      // fp16 1024.0 replicated in both 16-bit lanes.
      const uint32_t bias = 0x64006400;

      // 3-bit field selectors, applied to both 16-bit lanes at once.
      const uint32_t field_x1  = 0x00070007;   // bits 0..2 -> 1024 + n
      const uint32_t field_x8  = 0x00380038;   // bits 3..5 -> 1024 + 8n
      const uint32_t field_x64 = 0x01c001c0;   // bits 6..8 -> 1024 + 64n

      // Multipliers that undo the x8 / x64 scaling.
      const half inv8_h  = __float2half_rn(1.0f / 8.0f);
      const half inv64_h = __float2half_rn(1.0f / 64.0f);
      const half2 inv8   = __halves2half2(inv8_h, inv8_h);
      const half2 inv64  = __halves2half2(inv64_h, inv64_h);

      // Offsets that remove the (rescaled) 1024 bias together with the zero
      // point: -1024 - zero, -128 - zero and -16 - zero respectively.
      // 0xe400 is fp16 -1024; OR-ing zero into the mantissa gives
      // -(1024 + zero), which presumes zero fits in the mantissa bits.
      const half_uint16 off1_bits(0xe400 | zero);
      const half off8_h  = __hsub(__int2half_rn(-128), __int2half_rn(zero));
      const half off64_h = __hsub(__int2half_rn(-16), __int2half_rn(zero));
      const half2 off1   = __halves2half2(off1_bits.as_half, off1_bits.as_half);
      const half2 off8   = __halves2half2(off8_h, off8_h);
      const half2 off64  = __halves2half2(off64_h, off64_h);

      // Each word is examined at shift 0 (first two pairs) and shift 6 (next
      // three pairs). Bits that spill across the 16-bit lane boundary after
      // shifting are discarded by the field masks.
      const uint32_t a_hi = q_0 >> 6;
      const uint32_t b_hi = q_1 >> 6;
      const uint32_t c_hi = q_2 >> 6;

      // ---- first word: values 0..9 ----
      const half2_uint32 v00_01((q_0  & field_x1 ) | bias);
      const half2_uint32 v02_03((q_0  & field_x8 ) | bias);
      const half2_uint32 v04_05((a_hi & field_x1 ) | bias);
      const half2_uint32 v06_07((a_hi & field_x8 ) | bias);
      const half2_uint32 v08_09((a_hi & field_x64) | bias);

      // ---- second word: values 10..19 ----
      const half2_uint32 v10_11((q_1  & field_x1 ) | bias);
      const half2_uint32 v12_13((q_1  & field_x8 ) | bias);
      const half2_uint32 v14_15((b_hi & field_x1 ) | bias);
      const half2_uint32 v16_17((b_hi & field_x8 ) | bias);
      const half2_uint32 v18_19((b_hi & field_x64) | bias);

      // ---- third word: values 20..29 ----
      const half2_uint32 v20_21((q_2  & field_x1 ) | bias);
      const half2_uint32 v22_23((q_2  & field_x8 ) | bias);
      const half2_uint32 v24_25((c_hi & field_x1 ) | bias);
      const half2_uint32 v26_27((c_hi & field_x8 ) | bias);
      const half2_uint32 v28_29((c_hi & field_x64) | bias);

      // ---- values 30 and 31 ----
      // Their bits sit in bit 15 / bit 31 of the three words; move them to
      // bit positions 0, 1 and 2 of each lane and merge.
      const uint32_t tail_bit0 = (q_0 >> 15) & 0x00010001;
      const uint32_t tail_bit1 = (q_1 >> 14) & 0x00020002;
      const uint32_t tail_bit2 = (q_2 >> 13) & 0x00040004;
      const half2_uint32 v30_31((tail_bit0 | tail_bit1 | tail_bit2) | bias);

      // Strip the bias and the zero point.
      dq[ 0] = __hadd2(v00_01.as_half2, off1);
      dq[ 1] = __hfma2(v02_03.as_half2, inv8,  off8);
      dq[ 2] = __hadd2(v04_05.as_half2, off1);
      dq[ 3] = __hfma2(v06_07.as_half2, inv8,  off8);
      dq[ 4] = __hfma2(v08_09.as_half2, inv64, off64);

      dq[ 5] = __hadd2(v10_11.as_half2, off1);
      dq[ 6] = __hfma2(v12_13.as_half2, inv8,  off8);
      dq[ 7] = __hadd2(v14_15.as_half2, off1);
      dq[ 8] = __hfma2(v16_17.as_half2, inv8,  off8);
      dq[ 9] = __hfma2(v18_19.as_half2, inv64, off64);

      dq[10] = __hadd2(v20_21.as_half2, off1);
      dq[11] = __hfma2(v22_23.as_half2, inv8,  off8);
      dq[12] = __hadd2(v24_25.as_half2, off1);
      dq[13] = __hfma2(v26_27.as_half2, inv8,  off8);
      dq[14] = __hfma2(v28_29.as_half2, inv64, off64);

      dq[15] = __hadd2(v30_31.as_half2, off1);
  }
  121. } // namespace gptq
  122. } // namespace aphrodite
  123. #endif