// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/vecdotq.cuh
// and https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmq.cu
static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
    int x32 = 0;
    x32 |= x16[0] << 0;
    x32 |= x16[1] << 16;
    return x32;
}
static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
    int x32 = 0;
    x32 |= x16[0] << 0;
    x32 |= x16[1] << 16;
    return x32;
}
static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}
static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}
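// The helpers above reinterpret 4 consecutive quantized bytes as one packed 32-bit integer so
// that __dp4a (a per-byte SIMD dot product, available from compute capability 6.1, hence the
// __CUDA_ARCH__ >= 610 guards below) can process four quantized values per instruction.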
#define VDR_Q4_0_Q8_1_MMVQ 2
#define VDR_Q4_0_Q8_1_MMQ 4
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
    const int * v, const int * u, const float & d4, const half2 & ds8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
        // SIMD dot product of quantized values
        sumi = __dp4a(vi0, u[2*i+0], sumi);
        sumi = __dp4a(vi1, u[2*i+1], sumi);
    }
    const float2 ds8f = __half22float2(ds8);
    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#endif
}
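// Note on the correction term: a q4_0 weight dequantizes to d4 * (q - 8), so the exact dot
// product is d4*d8*sum(q*y) - 8*d4*(d8*sum(y)). The formula above relies on ds8.y holding d8
// times the sum of the whole q8_1 block; vdr/QI4_0 is each thread's share of that block, so the
// per-thread subtraction of (8*vdr/QI4_0) * ds8.y adds up to the full correction.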
#define VDR_Q4_1_Q8_1_MMVQ 2
#define VDR_Q4_1_Q8_1_MMQ 4
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
        // SIMD dot product of quantized values
        sumi = __dp4a(vi0, u[2*i+0], sumi);
        sumi = __dp4a(vi1, u[2*i+1], sumi);
    }
    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
    const float d4d8 = tmp.x;
    const float m4s8 = tmp.y;
    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#endif
}
#define VDR_Q5_0_Q8_1_MMVQ 2
#define VDR_Q5_0_Q8_1_MMQ 4
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
        vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
        int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
        vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
        vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
        vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
    }
    const float2 ds8f = __half22float2(ds8);
    // second part effectively subtracts 16 from each quant value
    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#endif
}
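// The shifts above scatter bits 0..3 (respectively 16..19) of the qh word into bit positions
// 4, 12, 20 and 28, i.e. into the 5th bit of each byte that already holds a 4-bit value, so a
// complete 5-bit quant ends up in every byte of vi0/vi1 before the __dp4a.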
#define VDR_Q5_1_Q8_1_MMVQ 2
#define VDR_Q5_1_Q8_1_MMQ 4
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
        vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
        int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
        vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
        vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
        vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
    }
    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
    const float d5d8 = tmp.x;
    const float m5s8 = tmp.y;
    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
#endif
}
#define VDR_Q8_0_Q8_1_MMVQ 2
#define VDR_Q8_0_Q8_1_MMQ 8
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
    const int * v, const int * u, const float & d8_0, const float & d8_1) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = __dp4a(v[i], u[i], sumi);
    }
    return d8_0*d8_1 * sumi;
#endif
}
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = __dp4a(v[i], u[i], sumi);
    }
    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
    const float d8d8 = tmp.x;
    const float m8s8 = tmp.y;
    // scale second part of sum by QI8_1 / vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#endif
}
#define VDR_Q2_K_Q8_1_MMVQ 1
#define VDR_Q2_K_Q8_1_MMQ 2
// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const half2 & dm2, const float * __restrict__ d8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;
#pragma unroll
    for (int i = 0; i < QR2_K; ++i) {
        const int sc = scales[2*i];
        const int vi = (v >> (2*i)) & 0x03030303;
        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
        // fill int with 4x m
        int m = sc >> 4;
        m |= m << 8;
        m |= m << 16;
        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
    }
    const float2 dm2f = __half22float2(dm2);
    return dm2f.x*sumf_d - dm2f.y*sumf_m;
#endif
}
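// Each q2_K scale byte packs the 4-bit sub-block scale in its low nibble (sc & 0xF) and the
// 4-bit sub-block minimum in its high nibble (sc >> 4); dm2 carries the super-block scale and
// minimum that multiply the two partial sums.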
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const half2 & dm2, const float & d8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    int sumi_d = 0;
    int sumi_m = 0;
#pragma unroll
    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
        int sumi_d_sc = 0;
        const int sc = scales[i0 / (QI8_1/2)];
        // fill int with 4x m
        int m = sc >> 4;
        m |= m << 8;
        m |= m << 16;
#pragma unroll
        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
            sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
        }
        sumi_d += sumi_d_sc * (sc & 0xF);
    }
    const float2 dm2f = __half22float2(dm2);
    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
#endif
}
#define VDR_Q3_K_Q8_1_MMVQ 1
#define VDR_Q3_K_Q8_1_MMQ 2
// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    float sumf = 0.0f;
#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        const int isc = scale_offset + 2*i;
        const int isc_low = isc % (QK_K/32);
        const int sc_shift_low = 4 * (isc / (QK_K/32));
        const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
        const int isc_high = isc % (QK_K/64);
        const int sc_shift_high = 2 * (isc / (QK_K/64));
        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
        const int sc = (sc_low | sc_high) - 32;
        const int vil = (vl >> (2*i)) & 0x03030303;
        const int vih = ((vh >> i) << 2) & 0x04040404;
        const int vi = __vsubss4(vil, vih);
        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }
    return d3 * sumf;
#endif
}
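// q3_K scales are 6 bits each: 4 low bits taken from the first QK_K/32 scale bytes plus 2 high
// bits gathered from the remaining bytes, stored with an offset of 32 (hence the "- 32"). The
// high bits of the quants arrive through vh, pre-inverted by the caller so that __vsubss4
// subtracts 4 exactly where the original hmask bit was 0.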
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d3, const float & d8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    int sumi = 0;
#pragma unroll
    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
        int sumi_sc = 0;
        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
        }
        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
    }
    return d3*d8 * sumi;
#endif
}
#define VDR_Q4_K_Q8_1_MMVQ 2
#define VDR_Q4_K_Q8_1_MMQ 8
// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;
#pragma unroll
    for (int i = 0; i < QR4_K; ++i) {
        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
    }
    const float2 dm4f = __half22float2(dm4);
    return dm4f.x*sumf_d - dm4f.y*sumf_m;
#endif
}
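// __dp4a(0x01010101, u[..], 0) adds up the four signed bytes of u, so dot2 is the plain sum of
// the q8_1 values; multiplying it by the sub-block minimum m[i] accounts for the constant part
// of the q4_K dequantization (value = d * sc * q - dmin * m).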
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;
#pragma unroll
    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;
#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }
        const float2 ds8f = __half22float2(ds8[i]);
        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
    }
    const float2 dm4f = __half22float2(dm4);
    return dm4f.x*sumf_d - dm4f.y*sumf_m;
#endif
}
#define VDR_Q5_K_Q8_1_MMVQ 2
#define VDR_Q5_K_Q8_1_MMQ 8
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;
#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
        const int v0i = vl0i | vh0i;
        const int v1i = vl1i | vh1i;
        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);
    }
    const float2 dm5f = __half22float2(dm5);
    return dm5f.x*sumf_d - dm5f.y*sumf_m;
#endif
}
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;
#pragma unroll
    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;
#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }
        const float2 ds8f = __half22float2(ds8[i]);
        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q5_K min val
    }
    const float2 dm4f = __half22float2(dm4);
    return dm4f.x*sumf_d - dm4f.y*sumf_m;
#endif
}
#define VDR_Q6_K_Q8_1_MMVQ 1
#define VDR_Q6_K_Q8_1_MMQ 8
// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d, const float * __restrict__ d8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    float sumf = 0.0f;
#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        const int sc = scales[4*i];
        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }
    return d*sumf;
#endif
}
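// q6_K keeps the low 4 bits of each 6-bit quant in vl and the high 2 bits in vh; after merging
// them, 32 (0x20 per packed byte) is subtracted to recenter the values, and each group of 16
// values shares one signed 8-bit scale.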
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
    const float & d6, const float * __restrict__ d8) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    float sumf_d = 0.0f;
#pragma unroll
    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
#pragma unroll
        for (int i = i0; i < i0 + 2; ++i) {
            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
        }
        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
    }
    return d6 * sumf_d;
#endif
}
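// The remainder of the file provides, for every supported quantization type, the per-block
// vec_dot_*_q8_1 function used by the mul_mat_vec_q (mmvq) path plus the allocate_tiles_* /
// load_tiles_* / vec_dot_*_q8_1_mul_mat trio consumed by the tiled mul_mat_q (mmq) kernels.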
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
    int v[VDR_Q4_0_Q8_1_MMVQ];
    int u[2*VDR_Q4_0_Q8_1_MMVQ];
#pragma unroll
    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
        v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
    }
    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, __half2float(bq4_0->d), bq8_1->ds);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
    *x_ql = tile_x_qs;
    *x_dm = (half2 *) tile_x_d;
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    const int kbx = k / QI4_0;
    const int kqsx = k % QI4_0;
    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
    float * x_dmf = (float *) x_dm;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
    }
    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
    const int kbxd = k % blocks_per_tile_x_row;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = __half2float(bxi->d);
    }
}
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh; (void)x_sc;
    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const float * x_dmf = (const float *) x_dm;
    int u[2*VDR_Q4_0_Q8_1_MMQ];
#pragma unroll
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
    }
    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
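// In the tile loaders the x_ql rows use a stride of WARP_SIZE + 1 ints (2*WARP_SIZE + 1 for the
// 5-bit and q5_K types); the extra element is padding that helps avoid shared-memory bank
// conflicts when the threads of a warp read a column of the tile.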
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
    int v[VDR_Q4_1_Q8_1_MMVQ];
    int u[2*VDR_Q4_1_Q8_1_MMVQ];
#pragma unroll
    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
        v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
    }
    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
    *x_ql = tile_x_qs;
    *x_dm = tile_x_dm;
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    const int kbx = k / QI4_1;
    const int kqsx = k % QI4_1;
    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }
    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
    const int kbxd = k % blocks_per_tile_x_row;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
    }
}
static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    int u[2*VDR_Q4_1_Q8_1_MMQ];
#pragma unroll
    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
    }
    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
    int vl[VDR_Q5_0_Q8_1_MMVQ];
    int vh[VDR_Q5_0_Q8_1_MMVQ];
    int u[2*VDR_Q5_0_Q8_1_MMVQ];
#pragma unroll
    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
        vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
        vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
    }
    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, __half2float(bq5_0->d), bq8_1->ds);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
    *x_ql = tile_x_ql;
    *x_dm = (half2 *) tile_x_d;
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    const int kbx = k / QI5_0;
    const int kqsx = k % QI5_0;
    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
        const int ql = get_int_from_uint8(bxi->qs, kqsx);
        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
        int qs0 = (ql >> 0) & 0x0F0F0F0F;
        qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
        qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
        qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
        qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
        qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
        int qs1 = (ql >> 4) & 0x0F0F0F0F;
        qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
        qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
        qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
        qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
        qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
    }
    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
    const int kbxd = k % blocks_per_tile_x_row;
    float * x_dmf = (float *) x_dm;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = __half2float(bxi->d);
    }
}
static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
    const float * x_dmf = (const float *) x_dm;
    const float * y_df = (const float *) y_ds;
    int u[2*VDR_Q5_0_Q8_1_MMQ];
#pragma unroll
    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
    }
    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
    int vl[VDR_Q5_1_Q8_1_MMVQ];
    int vh[VDR_Q5_1_Q8_1_MMVQ];
    int u[2*VDR_Q5_1_Q8_1_MMVQ];
#pragma unroll
    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
        vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
        vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
    }
    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    const int kbx = k / QI5_1;
    const int kqsx = k % QI5_1;
    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
        int qs0 = (ql >> 0) & 0x0F0F0F0F;
        qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
        qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
        qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
        qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
        int qs1 = (ql >> 4) & 0x0F0F0F0F;
        qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
        qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
        qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
        qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
    }
    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
    const int kbxd = k % blocks_per_tile_x_row;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
    }
}
static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_1) + i/QI5_1 + k/QI5_1;
    int u[2*VDR_Q5_1_Q8_1_MMQ];
#pragma unroll
    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
    }
    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
    int v[VDR_Q8_0_Q8_1_MMVQ];
    int u[VDR_Q8_0_Q8_1_MMVQ];
#pragma unroll
    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
    }
    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, __half2float(bq8_0->d), __low2float(bq8_1->ds));
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
    *x_ql = tile_x_qs;
    *x_dm = (half2 *) tile_x_d;
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    const int kbx = k / QI8_0;
    const int kqsx = k % QI8_0;
    float * x_dmf = (float *) x_dm;
    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
    }
    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
    const int kbxd = k % blocks_per_tile_x_row;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = __half2float(bxi->d);
    }
}
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    const float * x_dmf = (const float *) x_dm;
    const float * y_df = (const float *) y_ds;
    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
}
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
    const int bq8_offset = QR2_K * (iqs / QI8_1);
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
    const uint8_t * scales = bq2_K->scales + scale_offset;
    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
    int u[QR2_K];
    float d8[QR2_K];
#pragma unroll
    for (int i = 0; i < QR2_K; ++ i) {
        u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }
    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    const int kbx = k / QI2_K;
    const int kqsx = k % QI2_K;
    const block_q2_K * bx0 = (const block_q2_K *) vx;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }
    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
    const int kbxd = k % blocks_per_tile_x_row;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
    }
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
    }
}
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    const int kbx = k / QI2_K;
    const int ky = (k % QI2_K) * QR2_K;
    const float * y_df = (const float *) y_ds;
    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
#pragma unroll
    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
    }
    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
}
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
    const float d = __half2float(bq3_K->d);
    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
    int u[QR3_K];
    float d8[QR3_K];
#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }
    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
    __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_qh = tile_x_qh;
    *x_sc = tile_x_sc;
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    const int kbx = k / QI3_K;
    const int kqsx = k % QI3_K;
    const block_q3_K * bx0 = (const block_q3_K *) vx;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
    }
    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
    const int kbxd = k % blocks_per_tile_x_row;
    float * x_dmf = (float *) x_dm;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = __half2float(bxi->d);
    }
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
    }
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
        const int ksc = k % (QI3_K/4);
        const int ksc_low = ksc % (QI3_K/8);
        const int shift_low = 4 * (ksc / (QI3_K/8));
        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
        const int ksc_high = QI3_K/8;
        const int shift_high = 2 * ksc;
        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
    }
}
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    const int kbx = k / QI3_K;
    const int ky = (k % QI3_K) * QR3_K;
    const float * x_dmf = (const float *) x_dm;
    const float * y_df = (const float *) y_ds;
    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
#pragma unroll
    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
        const int shift = 2 * ((ky % 32) / 8);
        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
        const int vlh = (vh << 2) & 0x04040404;
        v[l] = __vsubss4(vll, vlh);
    }
    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
}
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
    int v[2];
    int u[2*QR4_K];
    float d8[QR4_K];
    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    v[0] = q4[0];
    v[1] = q4[4];
    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m = sc + 2;
    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = __low2float(bq8i->ds);
        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }
    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
}
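// q4_K packs eight 6-bit scales and eight 6-bit mins into its 12 scale bytes; the aux[] logic
// above reassembles the pair selected by bq8_offset: the first four pairs come straight from
// the low 6 bits, the last four combine the low nibbles of the trailing bytes with the top 2
// bits of the leading bytes.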
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    const int kbx = k / QI4_K; // == 0 if QK_K == 256
    const int kqsx = k % QI4_K; // == k if QK_K == 256
    const block_q4_K * bx0 = (const block_q4_K *) vx;
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }
    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
    }
#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
        if (need_check) {
            i = min(i, i_max);
        }
        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
        const int * scales = (const int *) bxi->scales;
        const int ksc = k % (WARP_SIZE/8);
        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
        scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
    }
}
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh;
    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
        x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
}
  975. static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  976. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  977. const block_q5_K * bq5_K = (const block_q5_K *) vbq;
  978. int vl[2];
  979. int vh[2];
  980. int u[2*QR5_K];
  981. float d8[QR5_K];
  982. const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
  983. const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
  984. const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
  985. vl[0] = ql[0];
  986. vl[1] = ql[4];
  987. vh[0] = qh[0] >> bq8_offset;
  988. vh[1] = qh[4] >> bq8_offset;
  989. const uint16_t * scales = (const uint16_t *)bq5_K->scales;
  990. uint16_t aux[2];
  991. const int j = bq8_offset/2;
  992. if (j < 2) {
  993. aux[0] = scales[j+0] & 0x3f3f;
  994. aux[1] = scales[j+2] & 0x3f3f;
  995. } else {
  996. aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
  997. aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
  998. }
  999. const uint8_t * sc = (const uint8_t *)aux;
  1000. const uint8_t * m = sc + 2;
  1001. #pragma unroll
  1002. for (int i = 0; i < QR5_K; ++i) {
  1003. const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
  1004. d8[i] = __low2float(bq8i->ds);
  1005. const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  1006. u[2*i+0] = q8[0];
  1007. u[2*i+1] = q8[4];
  1008. }
  1009. return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
  1010. }
  1011. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  1012. __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  1013. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
  1014. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
  1015. *x_ql = tile_x_ql;
  1016. *x_dm = tile_x_dm;
  1017. *x_sc = tile_x_sc;
  1018. }
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
    const int kqsx = k % QI5_K; // == k if QK_K == 256

    const block_q5_K * bx0 = (const block_q5_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
        const int ky = QR5_K*kqsx;

        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;

        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);

        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

        const int * scales = (const int *) bxi->scales;
        const int ksc = k % (WARP_SIZE/8);

        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits

        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
    }
}
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh; // the high bits were already merged into x_ql by load_tiles_q5_K

    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

    const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
    const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
}
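// Q6_K * Q8_1 dot product for the mul_mat_vec_q path: combines the 4 low bits (ql) and
// 2 high bits (qh) of each quant, selects the matching int8 group scales and forwards
// everything to vec_dot_q6_K_q8_1_impl_mmvq along with the block scale d and the q8_1 scales d8.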
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q6_K * bq6_K = (const block_q6_K *) vbq;

    const int bq8_offset   = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift     = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));

    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;

    const int8_t * scales = bq6_K->scales + scale_offset;

    int   u[QR6_K];
    float d8[QR6_K];

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
    }

    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, __half2float(bq6_K->d), d8);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}
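// Loads one k-slice of Q6_K data into the tiles: low and high bits are merged and
// recentered by subtracting 32 (__vsubss4 with 0x20202020) so x_ql holds signed 8-bit
// quants, the half-precision block scale d is stored as float in x_dm, and the int8
// group scales are copied into x_sc.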
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
    const int kqsx = k % QI6_K; // == k if QK_K == 256

    const block_q6_K * bx0 = (const block_q6_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
        const int ky = QR6_K*kqsx;

        const int ql = get_int_from_uint8(bxi->ql, kqsx);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;

        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);

        x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
        x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = __half2float(bxi->d);
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;

        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
    }
}
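// Tile-based Q6_K * Q8_1 dot product for mul_mat_q, consuming the signed quants, float
// block scales and int8 group scales prepared by load_tiles_q6_K.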
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh; // the high bits were already merged into x_ql by load_tiles_q6_K

    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);

    const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
    const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
}
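// IQ2_XXS * Q8_1 dot product: each byte of qs selects an 8-value codebook entry from
// iq2xxs_grid, the trailing 32 bits supply 7-bit sign masks (via ksigns_iq2xs) plus the
// 4-bit group scale left in aux32, and the sum is accumulated with plain integer math.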
static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

    const int ib32 = iqs;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const uint8_t * aux8 = (const uint8_t *)q2;
    const int8_t  * q8 = bq8_1[ib32].qs;
    uint32_t aux32 = q2[2] | (q2[3] << 16);

    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
        const uint8_t signs = ksigns_iq2xs[aux32 & 127];
        for (int j = 0; j < 8; ++j) {
            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
        }
        q8 += 8;
        aux32 >>= 7;
    }
    const float d = __half2float(bq2->d) * (0.5f + aux32) * __half2float(bq8_1[ib32].ds.x) * 0.25f;
    return d * sumi;
}
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

    const int ib32 = iqs;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const int8_t   * q8 = bq8_1[ib32].qs;
    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
    const uint8_t ls2 = bq2->scales[ib32] >> 4;

    int sumi1 = 0;
    for (int l = 0; l < 2; ++l) {
        const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
        const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
        for (int j = 0; j < 8; ++j) {
            sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
        }
        q8 += 8;
    }

    int sumi2 = 0;
    for (int l = 2; l < 4; ++l) {
        const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
        const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
        for (int j = 0; j < 8; ++j) {
            sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
        }
        q8 += 8;
    }

    const float d = __half2float(bq2->d) * __half2float(bq8_1[ib32].ds.x) * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
}
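// IQ2_S * Q8_1 dot product (dp4a path): the grid index combines qs with two extra bits
// from qh, the stored sign bits are expanded to per-byte masks with __vcmpeq4, and the
// signed grid values are dotted against the q8 quants with __dp4a; guarded because dp4a
// requires compute capability 6.1 or newer.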
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

    const int ib32 = iqs;
    const int8_t  * q8 = bq8_1[ib32].qs;
    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
    const uint8_t ls2 = bq2->scales[ib32] >> 4;

    int sumi1 = 0;
    for (int l = 0; l < 2; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
        q8 += 8;
    }

    int sumi2 = 0;
    for (int l = 2; l < 4; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
        q8 += 8;
    }

    const float d = __half2float(bq2->d) * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else
    (void)vbq; (void)bq8_1; (void)iqs;
    return 0.0f; // fallback so the function still returns a value when compiled without dp4a support
#endif
}
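// IQ3_XXS * Q8_1 dot product (dp4a path): pairs of qs bytes select 4-value entries from
// iq3xxs_grid, sign masks come from the ksigns64 table, and the bits left in aux32 after
// the loop act as the group scale, analogous to the IQ2_XXS case above.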
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

    const int ib32 = iqs;
    const uint8_t  * q3 = bq2->qs + 8*ib32;
    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
    const int8_t   * q8 = bq8_1[ib32].qs;
    uint32_t aux32 = gas[0] | (gas[1] << 16);

    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
        q8 += 8;
        aux32 >>= 7;
    }
    const float d = __half2float(bq2->d) * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
    return d * sumi;
#else
    (void)vbq; (void)bq8_1; (void)iqs;
    return 0.0f; // fallback so the function still returns a value when compiled without dp4a support
#endif
}
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

    const int ib32 = iqs;
    const uint8_t * qs = bq2->qs + 8*ib32;
    const int8_t  * q8 = bq8_1[ib32].qs;

    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
        const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
        q8 += 8;
    }
    const float d = __half2float(bq2->d) * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f;
    return d * sumi;
#else
    (void)vbq; (void)bq8_1; (void)iqs;
    return 0.0f; // fallback so the function still returns a value when compiled without dp4a support
#endif
}
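// IQ1_S * Q8_1 dot product (dp4a path): each of the four 8-value groups takes its grid
// index from qs plus one high bit of the corresponding scales byte, and the per-group
// scale is the odd value 2*(h & 7) + 1 applied after the integer dot products.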
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

    const int ib32 = iqs;
    int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
    const uint8_t h1 = bq1->scales[2*ib32+0];
    const uint8_t h2 = bq1->scales[2*ib32+1];
    const int * q8 = (const int *)bq8_1[ib32].qs;
    const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
    const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
    const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
    const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
    for (int j = 0; j < 2; ++j) {
        sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
        sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
        sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
        sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
    }
    const float d = __half2float(bq1->d) * __low2float(bq8_1[ib32].ds);
    return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
                sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
#else
    (void)vbq; (void)bq8_1; (void)iqs;
    return 0.0f; // fallback so the function still returns a value when compiled without dp4a support
#endif
}
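// Expands a 32-bit word holding eight 4-bit indices into two 32-bit words of table
// entries: the four low nibbles are looked up in values[] and packed into val1, the four
// high nibbles into val2. Used by the IQ4_NL and IQ4_XS dot products below.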
static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
        int & val1, int & val2) {

    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
    aux32 = q4 & 0x0f0f0f0f;
    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
    val1 = v1 | (v2 << 16);
    aux32 = (q4 >> 4) & 0x0f0f0f0f;
    v1 = values[q8[0]] | (values[q8[1]] << 8);
    v2 = values[q8[2]] | (values[q8[3]] << 8);
    val2 = v1 | (v2 << 16);
}
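// IQ4_NL * Q8_1 dot product (dp4a path): 4-bit indices are mapped through the non-linear
// kvalues_iq4nl table with get_int_from_table_16 and dotted against the q8 quants, with
// one d * d8 scale per block.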
static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;

    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
    const int32_t  * q8 = (const int32_t *)bq8_1->qs + iqs;

    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

    int v1, v2;
    int sumi1 = 0, sumi2 = 0;
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
        get_int_from_table_16(aux, values, v1, v2);
        sumi1 = __dp4a(v1, q8[l+0], sumi1);
        sumi2 = __dp4a(v2, q8[l+4], sumi2);
    }

    const float d = __half2float(bq->d) * __low2float(bq8_1->ds);
    return d * (sumi1 + sumi2);
#else
    (void)vbq; (void)bq8_1; (void)iqs;
    return 0.0f; // fallback so the function still returns a value when compiled without dp4a support
#endif
}
static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

    // iqs is 0...7
    const int ib32 = iqs;
    const int32_t  * q8 = (const int32_t *)bq8_1[ib32].qs;
    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
    const float d = __half2float(bq4->d) * (ls - 32) * __low2float(bq8_1[ib32].ds);
    int v1, v2;
    int sumi1 = 0, sumi2 = 0;
    for (int j = 0; j < 4; ++j) {
        get_int_from_table_16(q4[j], values, v1, v2);
        sumi1 = __dp4a(v1, q8[j+0], sumi1);
        sumi2 = __dp4a(v2, q8[j+4], sumi2);
    }
    return d * (sumi1 + sumi2);
#else
    (void)vbq; (void)bq8_1; (void)iqs;
    return 0.0f; // fallback so the function still returns a value when compiled without dp4a support
#endif
}
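// Summary of the two calling conventions served by the functions in this header:
//  * vec_dot_<type>_q8_1(vbq, bq8_1, iqs): one quantized block dotted against q8_1 data,
//    used by the mul_mat_vec_q-style kernels; iqs selects the position inside the block,
//    with its range and stride set by the calling kernel via the per-type QI*/VDR_* constants.
//  * allocate_tiles_<type> / load_tiles_<type> / vec_dot_<type>_q8_1_mul_mat: the tiled
//    mul_mat_q path, where blocks are first unpacked into shared memory and the dot
//    product then runs entirely on those tiles.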