// gguf_kernel.cu

#include <cuda_fp16.h>
#include <cuda_runtime.h>

#include <torch/all.h>
#include <torch/python.h>

#include <c10/cuda/CUDAGuard.h>

#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
#define WARP_SIZE 32
#define K_SCALE_SIZE 12
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define GGML_CUDA_DMMV_X 32
#define GGML_CUDA_MMV_Y 1

// Data Structures
// QK = number of values after dequantization
// QR = QK / number of values before dequantization
// QI = number of 32 bit integers before dequantization

#define QK4_0 32
#define QR4_0 2
#define QI4_0 (QK4_0 / (4 * QR4_0))
typedef struct {
    half d;                 // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
} block_q4_0;
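
// Illustrative sketch (the helper name is ours, added for exposition; the
// kernel's real dequantization paths appear further down the file). Each byte
// of qs packs two 4-bit quants: the low nibble is element j, the high nibble
// element j + QK4_0/2, and a quant q in [0, 15] decodes to d * (q - 8).
static __device__ __forceinline__ void dequantize_q4_0_sketch(
        const block_q4_0 * b, float * dst) {
    const float d = __half2float(b->d);      // per-block scale
    for (int j = 0; j < QK4_0 / 2; ++j) {
        const int lo = b->qs[j] & 0x0F;      // low nibble -> element j
        const int hi = b->qs[j] >> 4;        // high nibble -> element j + 16
        dst[j]             = d * (lo - 8);   // values are centered around 0
        dst[j + QK4_0 / 2] = d * (hi - 8);
    }
}
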
#define QK4_1 32
#define QR4_1 2
#define QI4_1 (QK4_1 / (4 * QR4_1))
typedef struct {
    half2 dm;               // dm.x = delta, dm.y = min
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
} block_q4_1;

#define QK5_0 32
#define QR5_0 2
#define QI5_0 (QK5_0 / (4 * QR5_0))
typedef struct {
    half d;                 // delta
    uint8_t qh[4];          // 5-th bit of quants
    uint8_t qs[QK5_0 / 2];  // nibbles / quants
} block_q5_0;

#define QK5_1 32
#define QR5_1 2
#define QI5_1 (QK5_1 / (4 * QR5_1))
typedef struct {
    half2 dm;               // dm.x = delta, dm.y = min
    uint8_t qh[4];          // 5-th bit of quants
    uint8_t qs[QK5_1 / 2];  // nibbles / quants
} block_q5_1;

#define QK8_0 32
#define QR8_0 1
#define QI8_0 (QK8_0 / (4 * QR8_0))
typedef struct {
    half d;            // delta
    int8_t qs[QK8_0];  // quants
} block_q8_0;

#define QK8_1 32
#define QR8_1 1
#define QI8_1 (QK8_1 / (4 * QR8_1))
typedef struct {
    half2 ds;          // ds.x = delta, ds.y = sum
    int8_t qs[QK8_0];  // quants
} block_q8_1;
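
// Illustrative scalar sketch (the helper name and the plain loop are ours,
// added for exposition; the production kernels below use packed __dp4a paths).
// It shows why block_q8_1 keeps a block sum next to its scale: when a q8_1
// block is dotted with an offset format such as q4_0, whose values are
// d4 * (q4 - 8), the constant -8 factors out of the integer sum and can be
// corrected with ds.y alone, assuming ds.y ~= d8 * sum(q8) as the
// "ds.y = sum" comment above suggests (the exact definition is set by the
// quantization kernel further down the file).
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_sketch(
        const block_q4_0 * bq4, const block_q8_1 * bq8) {
    int sumi = 0;
    for (int j = 0; j < QK8_1 / 2; ++j) {
        sumi += (bq4->qs[j] & 0x0F) * bq8->qs[j];              // low nibbles
        sumi += (bq4->qs[j] >> 4)   * bq8->qs[j + QK8_1 / 2];  // high nibbles
    }
    const float2 ds8 = __half22float2(bq8->ds);  // x: d8, y: scaled block sum
    return __half2float(bq4->d) * (ds8.x * sumi - 8.0f * ds8.y);
}
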
#define QR2_K 4
#define QI2_K (QK_K / (4*QR2_K))
typedef struct {
    uint8_t scales[QK_K/16];  // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];       // quants
    half2 dm;                 // super-block scale for quantized scales/mins
} block_q2_K;

#define QR3_K 4
#define QI3_K (QK_K / (4*QR3_K))
typedef struct {
    uint8_t hmask[QK_K/8];         // quants - high bit
    uint8_t qs[QK_K/4];            // quants - low 2 bits
    uint8_t scales[K_SCALE_SIZE];  // scales, quantized with 6 bits
    half d;                        // super-block scale
} block_q3_K;

#define QR4_K 2
#define QI4_K (QK_K / (4*QR4_K))
typedef struct {
    half2 dm;                   // super-block scale for quantized scales/mins
    uint8_t scales[3*QK_K/64];  // scales, quantized with 6 bits
    uint8_t qs[QK_K/2];         // 4-bit quants
} block_q4_K;

#define QR5_K 2
#define QI5_K (QK_K / (4*QR5_K))
typedef struct {
    half2 dm;                      // super-block scale for quantized scales/mins
    uint8_t scales[K_SCALE_SIZE];  // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8];            // quants, high bit
    uint8_t qs[QK_K/2];            // quants, low 4 bits
} block_q5_K;

#define QR6_K 2
#define QI6_K (QK_K / (4*QR6_K))
typedef struct {
    uint8_t ql[QK_K/2];      // quants, lower 4 bits
    uint8_t qh[QK_K/4];      // quants, upper 2 bits
    int8_t scales[QK_K/16];  // scales
    half d;                  // delta
} block_q6_K;
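
// Note on the *_K ("K-quant") formats above: each block spans a super-block of
// QK_K = 256 values, split into 16-value (q2_K, q3_K, q6_K) or 32-value
// (q4_K, q5_K) sub-blocks that carry their own packed sub-scales; the half /
// half2 member is the super-block scale (and min, where present) those
// sub-scales are multiplied by. For q6_K, for instance, a weight is rebuilt as
// d * scales[i] * (q - 32), with the 6-bit q assembled from four low bits in
// ql and two high bits in qh.
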
#define QR2_XXS 8
#define QI2_XXS (QK_K / (4*QR2_XXS))
typedef struct {
    half d;
    uint16_t qs[QK_K/8];
} block_iq2_xxs;

#define QR2_XS 8
#define QI2_XS (QK_K / (4*QR2_XS))
typedef struct {
    half d;
    uint16_t qs[QK_K/8];
    uint8_t scales[QK_K/32];
} block_iq2_xs;

#define QR2_S 8
#define QI2_S (QK_K / (4*QR2_S))
typedef struct {
    half d;
    uint8_t qs[QK_K/4];
    uint8_t qh[QK_K/32];
    uint8_t scales[QK_K/32];
} block_iq2_s;

#define QR3_XXS 8
#define QI3_XXS (QK_K / (4*QR3_XXS))
typedef struct {
    half d;
    uint8_t qs[3*(QK_K/8)];
} block_iq3_xxs;

#define QR3_XS 8
#define QI3_XS (QK_K / (4*QR3_XS))
#define IQ3S_N_SCALE QK_K/64
typedef struct {
    half d;
    uint8_t qs[QK_K/4];
    uint8_t qh[QK_K/32];
    uint8_t signs[QK_K/8];
    uint8_t scales[IQ3S_N_SCALE];
} block_iq3_s;

#define QR1_S 8
#define QI1_S (QK_K / (4*QR1_S))
typedef struct {
    half d;
    uint8_t qs[QK_K/8];
    uint8_t scales[QK_K/16];
} block_iq1_s;

#define QK4_NL 32
#define QR4_NL 2
#define QI4_NL (QK4_NL / (4*QR4_NL))
typedef struct {
    half d;
    uint8_t qs[QK4_NL/2];
} block_iq4_nl;

#define QR4_XS 8
#define QI4_XS (QK_K / (4*QR4_XS))
typedef struct {
    half d;
    uint16_t scales_h;
    uint8_t scales_l[QK_K/64];
    uint8_t qs[QK_K/2];
} block_iq4_xs;
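
// Layout sanity checks (added for exposition): dequantization treats device
// buffers as arrays of these structs, so their byte sizes must match the GGUF
// on-disk layout exactly. The expected sizes below follow directly from the
// members, with no implicit padding.
static_assert(sizeof(block_q4_0) == sizeof(half) + QK4_0 / 2, "wrong q4_0 block size");
static_assert(sizeof(block_q8_0) == sizeof(half) + QK8_0, "wrong q8_0 block size");
static_assert(sizeof(block_q4_K) == sizeof(half2) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size");
static_assert(sizeof(block_q6_K) == QK_K/2 + QK_K/4 + QK_K/16 + sizeof(half), "wrong q6_K block size");
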
  159. static const __device__ uint64_t iq2xxs_grid[256] = {
  160. 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
  161. 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
  162. 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
  163. 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
  164. 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
  165. 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
  166. 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
  167. 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
  168. 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
  169. 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
  170. 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
  171. 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
  172. 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
  173. 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
  174. 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
  175. 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
  176. 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
  177. 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
  178. 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
  179. 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
  180. 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
  181. 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
  182. 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
  183. 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
  184. 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
  185. 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
  186. 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
  187. 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
  188. 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
  189. 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
  190. 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
  191. 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
  192. 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
  193. 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
  194. 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
  195. 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
  196. 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
  197. 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
  198. 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
  199. 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
  200. 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
  201. 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
  202. 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
  203. 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
  204. 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
  205. 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
  206. 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
  207. 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
  208. 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
  209. 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
  210. 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
  211. 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
  212. 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
  213. 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
  214. 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
  215. 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
  216. 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
  217. 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
  218. 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
  219. 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
  220. 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
  221. 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
  222. 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
  223. 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
  224. };
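
// The iq2*_grid tables (iq2xxs_grid above, iq2xs_grid and iq2s_grid below) are
// codebooks: each 64-bit entry packs eight byte-sized magnitudes drawn from
// {0x08, 0x19, 0x2b}, and the dequantization kernels further down index into
// them with the packed qs bits, applying signs and scales separately.
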
  225. static const __device__ uint64_t iq2xs_grid[512] = {
  226. 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
  227. 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
  228. 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
  229. 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
  230. 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
  231. 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
  232. 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
  233. 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
  234. 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
  235. 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
  236. 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
  237. 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
  238. 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
  239. 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
  240. 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
  241. 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
  242. 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
  243. 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
  244. 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
  245. 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
  246. 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
  247. 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
  248. 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
  249. 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
  250. 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
  251. 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
  252. 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
  253. 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
  254. 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
  255. 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
  256. 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
  257. 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
  258. 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
  259. 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
  260. 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
  261. 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
  262. 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
  263. 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
  264. 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
  265. 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
  266. 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
  267. 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
  268. 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
  269. 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
  270. 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
  271. 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
  272. 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
  273. 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
  274. 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
  275. 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
  276. 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
  277. 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
  278. 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
  279. 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
  280. 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
  281. 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
  282. 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
  283. 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
  284. 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
  285. 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
  286. 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
  287. 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
  288. 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
  289. 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
  290. 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
  291. 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
  292. 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
  293. 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
  294. 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
  295. 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
  296. 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
  297. 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
  298. 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
  299. 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
  300. 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
  301. 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
  302. 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
  303. 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
  304. 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
  305. 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
  306. 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
  307. 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
  308. 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
  309. 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
  310. 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
  311. 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
  312. 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
  313. 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
  314. 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
  315. 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
  316. 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
  317. 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
  318. 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
  319. 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
  320. 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
  321. 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
  322. 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
  323. 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
  324. 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
  325. 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
  326. 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
  327. 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
  328. 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
  329. 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
  330. 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
  331. 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
  332. 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
  333. 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
  334. 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
  335. 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
  336. 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
  337. 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
  338. 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
  339. 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
  340. 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
  341. 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
  342. 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
  343. 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
  344. 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
  345. 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
  346. 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
  347. 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
  348. 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
  349. 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
  350. 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
  351. 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
  352. 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
  353. 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
  354. };
  355. static const __device__ uint64_t iq2s_grid[1024] = {
  356. 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
  357. 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
  358. 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
  359. 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
  360. 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
  361. 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
  362. 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
  363. 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
  364. 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
  365. 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
  366. 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
  367. 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
  368. 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
  369. 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
  370. 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
  371. 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
  372. 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
  373. 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
  374. 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
  375. 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
  376. 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
  377. 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
  378. 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
  379. 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
  380. 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
  381. 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
  382. 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
  383. 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
  384. 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
  385. 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
  386. 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
  387. 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
  388. 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
  389. 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
  390. 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
  391. 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
  392. 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
  393. 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
  394. 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
  395. 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
  396. 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
  397. 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
  398. 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
  399. 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
  400. 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
  401. 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
  402. 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
  403. 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
  404. 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
  405. 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
  406. 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
  407. 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
  408. 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
  409. 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
  410. 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
  411. 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
  412. 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
  413. 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
  414. 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
  415. 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
  416. 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
  417. 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
  418. 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
  419. 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
  420. 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
  421. 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
  422. 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
  423. 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
  424. 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
  425. 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
  426. 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
  427. 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
  428. 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
  429. 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
  430. 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
  431. 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
  432. 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
  433. 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
  434. 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
  435. 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
  436. 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
  437. 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
  438. 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
  439. 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
  440. 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
  441. 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
  442. 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
  443. 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
  444. 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
  445. 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
  446. 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
  447. 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
  448. 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
  449. 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
  450. 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
  451. 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
  452. 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
  453. 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
  454. 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
  455. 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
  456. 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
  457. 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
  458. 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
  459. 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
  460. 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
  461. 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
  462. 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
  463. 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
  464. 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
  465. 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
  466. 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
  467. 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
  468. 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
  469. 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
  470. 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
  471. 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
  472. 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
  473. 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
  474. 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
  475. 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
  476. 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
  477. 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
  478. 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
  479. 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
  480. 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
  481. 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
  482. 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
  483. 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
  484. 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
  485. 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
  486. 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
  487. 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
  488. 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
  489. 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
  490. 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
  491. 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
  492. 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
  493. 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
  494. 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
  495. 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
  496. 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
  497. 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
  498. 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
  499. 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
  500. 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
  501. 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
  502. 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
  503. 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
  504. 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
  505. 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
  506. 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
  507. 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
  508. 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
  509. 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
  510. 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
  511. 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
  512. 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
  513. 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
  514. 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
  515. 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
  516. 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
  517. 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
  518. 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
  519. 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
  520. 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
  521. 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
  522. 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
  523. 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
  524. 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
  525. 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
  526. 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
  527. 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
  528. 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
  529. 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
  530. 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
  531. 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
  532. 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
  533. 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
  534. 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
  535. 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
  536. 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
  537. 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
  538. 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
  539. 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
  540. 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
  541. 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
  542. 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
  543. 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
  544. 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
  545. 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
  546. 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
  547. 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
  548. 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
  549. 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
  550. 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
  551. 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
  552. 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
  553. 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
  554. 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
  555. 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
  556. 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
  557. 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
  558. 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
  559. 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
  560. 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
  561. 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
  562. 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
  563. 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
  564. 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
  565. 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
  566. 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
  567. 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
  568. 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
  569. 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
  570. 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
  571. 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
  572. 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
  573. 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
  574. 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
  575. 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
  576. 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
  577. 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
  578. 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
  579. 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
  580. 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
  581. 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
  582. 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
  583. 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
  584. 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
  585. 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
  586. 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
  587. 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
  588. 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
  589. 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
  590. 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
  591. 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
  592. 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
  593. 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
  594. 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
  595. 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
  596. 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
  597. 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
  598. 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
  599. 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
  600. 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
  601. 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
  602. 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
  603. 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
  604. 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
  605. 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
  606. 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
  607. 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
  608. 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
  609. 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
  610. 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
  611. 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
  612. };
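// Note on the IQ2 grid tables: every 64-bit entry packs eight byte lanes, and each lane
// is one of 0x08, 0x19 or 0x2b (8, 25 or 43). The tables store magnitudes only; the IQ2
// dequant kernels below reinterpret an entry as uint8_t[8], scale each lane by the
// per-sub-block factor d, and apply signs separately (ksigns_iq2xs/kmask_iq2xs for
// iq2_xxs/iq2_xs, explicit sign bytes for iq2_s).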
  613. static const __device__ uint32_t iq3xxs_grid[256] = {
  614. 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
  615. 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
  616. 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
  617. 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
  618. 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
  619. 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
  620. 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
  621. 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
  622. 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
  623. 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
  624. 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
  625. 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
  626. 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
  627. 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
  628. 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
  629. 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
  630. 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
  631. 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
  632. 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
  633. 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
  634. 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
  635. 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
  636. 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
  637. 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
  638. 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
  639. 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
  640. 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
  641. 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
  642. 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
  643. 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
  644. 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
  645. 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
  646. };
  647. static const __device__ uint32_t iq3xs_grid[512] = {
  648. 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
  649. 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
  650. 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
  651. 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
  652. 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
  653. 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
  654. 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
  655. 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
  656. 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
  657. 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
  658. 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
  659. 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
  660. 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
  661. 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
  662. 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
  663. 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
  664. 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
  665. 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
  666. 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
  667. 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
  668. 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
  669. 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
  670. 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
  671. 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
  672. 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
  673. 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
  674. 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
  675. 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
  676. 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
  677. 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
  678. 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
  679. 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
  680. 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
  681. 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
  682. 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
  683. 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
  684. 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
  685. 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
  686. 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
  687. 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
  688. 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
  689. 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
  690. 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
  691. 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
  692. 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
  693. 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
  694. 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
  695. 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
  696. 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
  697. 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
  698. 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
  699. 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
  700. 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
  701. 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
  702. 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
  703. 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
  704. 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
  705. 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
  706. 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
  707. 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
  708. 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
  709. 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
  710. 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
  711. 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
  712. };
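// The two IQ3 grids above (iq3xxs_grid, iq3xs_grid) pack four byte lanes per 32-bit
// entry, each lane drawn from the fixed magnitude set {0x04, 0x0c, 0x14, 0x1c, 0x24,
// 0x2c, 0x34, 0x3e}. The iq3 dequant kernels below fetch two entries (grid1/grid2) to
// produce 8 outputs per thread, again applying signs separately.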
  713. static const __device__ uint64_t iq1s_grid[512] = {
  714. 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
  715. 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
  716. 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
  717. 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
  718. 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
  719. 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
  720. 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
  721. 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
  722. 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
  723. 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
  724. 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
  725. 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
  726. 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
  727. 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
  728. 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
  729. 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
  730. 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
  731. 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
  732. 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
  733. 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
  734. 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
  735. 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
  736. 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
  737. 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
  738. 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
  739. 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
  740. 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
  741. 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
  742. 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
  743. 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
  744. 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
  745. 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
  746. 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
  747. 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
  748. 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
  749. 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
  750. 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
  751. 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
  752. 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
  753. 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
  754. 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
  755. 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
  756. 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
  757. 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
  758. 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
  759. 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
  760. 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
  761. 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
  762. 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
  763. 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
  764. 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
  765. 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
  766. 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
  767. 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
  768. 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
  769. 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
  770. 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
  771. 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
  772. 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
  773. 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
  774. 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
  775. 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
  776. 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
  777. 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
  778. 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
  779. 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
  780. 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
  781. 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
  782. 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
  783. 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
  784. 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
  785. 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
  786. 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
  787. 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
  788. 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
  789. 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
  790. 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
  791. 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
  792. 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
  793. 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
  794. 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
  795. 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
  796. 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
  797. 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
  798. 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
  799. 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
  800. 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
  801. 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
  802. 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
  803. 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
  804. 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
  805. 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
  806. 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
  807. 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
  808. 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
  809. 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
  810. 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
  811. 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
  812. 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
  813. 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
  814. 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
  815. 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
  816. 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
  817. 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
  818. 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
  819. 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
  820. 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
  821. 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
  822. 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
  823. 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
  824. 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
  825. 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
  826. 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
  827. 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
  828. 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
  829. 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
  830. 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
  831. 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
  832. 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
  833. 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
  834. 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
  835. 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
  836. 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
  837. 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
  838. 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
  839. 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
  840. 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
  841. 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
  842. };
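// iq1s_grid entries are read as int8_t[8] by dequantize_block_iq1_s below: every byte
// lane is 0x00, 0x01 or 0xff, i.e. a ternary value in {-1, 0, +1}. The 8-bit qs value
// plus one extra bit from the scales nibble select one of the 512 rows, and the
// remaining 3 scale bits give the odd multiplier 2*(h & 7) + 1 applied through d.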
  843. static const __device__ uint8_t ksigns_iq2xs[128] = {
  844. 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
  845. 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
  846. 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
  847. 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
  848. 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
  849. 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
  850. 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
  851. 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
  852. };
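// ksigns_iq2xs[i] == i | ((popcount(i) & 1) << 7): the low 7 bits echo the index and
// bit 7 carries its parity, so a 7-bit sign field stored in the quantized data expands
// to 8 sign bits (the 8th sign is the parity bit). A host-side check that reproduces
// the table:
//   for (int i = 0; i < 128; ++i)
//       assert(ksigns_iq2xs[i] == (uint8_t)(i | ((__builtin_popcount(i) & 1) << 7)));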
  853. static const __device__ uint64_t ksigns64[128] = {
  854. 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
  855. 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
  856. 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
  857. 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
  858. 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
  859. 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
  860. 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
  861. 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
  862. 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
  863. 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
  864. 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
  865. 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
  866. 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
  867. 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
  868. 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
  869. 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
  870. 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
  871. 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
  872. 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
  873. 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
  874. 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
  875. 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
  876. 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
  877. 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
  878. 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
  879. 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
  880. 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
  881. 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
  882. 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
  883. 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
  884. 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
  885. 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
  886. };
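// ksigns64 is the byte-expanded form of ksigns_iq2xs: byte j of ksigns64[i] is 0xff
// exactly when bit j of ksigns_iq2xs[i] is set (e.g. ksigns64[1] == 0xff000000000000ff:
// sign bit 0 plus the parity bit). This lets kernels apply all eight signs to packed
// int8 data with a few 32/64-bit operations instead of a per-byte loop.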
  887. static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  888. static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
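// kvalues_iq4nl is the 16-entry non-uniform codebook for the 4-bit IQ4 formats: the
// iq4_nl/iq4_xs kernels below map each 4-bit index through this table and multiply by
// the per-block (or per-sub-block) scale d.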
  889. typedef half dfloat; // dequantize float
  890. typedef half2 dfloat2;
  891. typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
  892. typedef void (*to_fp16_cuda_t)(const void * __restrict__ x, dfloat * __restrict__ y, int k, cudaStream_t stream);
  893. typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
  894. typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
  895. typedef void (*load_tiles_cuda_t)(
  896. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  897. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
  898. typedef float (*vec_dot_q_mul_mat_cuda_t)(
  899. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  900. const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
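// Function-pointer typedefs used to dispatch per quantization format:
//  - dequantize_kernel_t : scalar dequant of one value pair (used by dequantize_block
//                          and dequantize_mul_mat_vec),
//  - to_fp16_cuda_t      : host-side launcher converting k quantized values to half,
//  - vec_dot_q_cuda_t, allocate_tiles_cuda_t, load_tiles_cuda_t, vec_dot_q_mul_mat_cuda_t:
//                          hooks for the quantized dot-product / tiled matmul code paths.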
901. // Utility functions
  902. #if defined(USE_ROCM)
  903. #ifndef __has_builtin
  904. #define __has_builtin(x) 0
  905. #endif
  906. typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
  907. static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
  908. const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
  909. const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
  910. #if __has_builtin(__builtin_elementwise_sub_sat)
  911. const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
  912. return reinterpret_cast<const int &>(c);
  913. #else
  914. int8x4_t c;
  915. int16_t tmp;
  916. #pragma unroll
  917. for (int i = 0; i < 4; i++) {
  918. tmp = va[i] - vb[i];
919. if (tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
920. if (tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
  921. c[i] = tmp;
  922. }
  923. return reinterpret_cast<int &>(c);
  924. #endif // __has_builtin(__builtin_elementwise_sub_sat)
  925. }
  926. static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
  927. #if __has_builtin(__builtin_amdgcn_sdot4)
  928. c = __builtin_amdgcn_sdot4(a, b, c, false);
  929. #else
  930. const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
  931. const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
  932. c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
  933. #endif
  934. return c;
  935. }
  936. #endif // defined(USE_ROCM)
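// On CUDA, __vsubss4 and __dp4a are hardware intrinsics; the block above only provides
// ROCm fallbacks. __vsubss4 performs a per-lane saturating int8 subtraction, and
// __dp4a(a, b, c) treats a and b as four packed int8 lanes and returns
// c + a0*b0 + a1*b1 + a2*b2 + a3*b3. For example, with a = 0x01010101 (four 1s) and
// b = 0x02020202 (four 2s), __dp4a(a, b, 0) == 8.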
  937. static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
  938. const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
  939. int x32 = 0;
  940. x32 |= x16[0] << 0;
  941. x32 |= x16[1] << 16;
  942. return x32;
  943. }
  944. static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
  945. const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
  946. int x32 = 0;
  947. x32 |= x16[0] << 0;
  948. x32 |= x16[1] << 16;
  949. return x32;
  950. }
  951. static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
  952. return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  953. }
  954. static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
  955. return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  956. }
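// These helpers load four consecutive quant bytes as one 32-bit lane (little-endian) so
// they can be fed to __dp4a: get_int_from_uint8(x8, k) returns
// x8[4k] | x8[4k+1] << 8 | x8[4k+2] << 16 | x8[4k+3] << 24. The plain variants go
// through two 16-bit loads because the block layouts only guarantee 2-byte alignment;
// the *_aligned variants assume 4-byte alignment and do a single 32-bit load.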
  957. // Dequant functions
  958. static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
  959. const block_q4_0 * x = (const block_q4_0 *) vx;
  960. const dfloat d = x[ib].d;
  961. const int vui = x[ib].qs[iqs];
  962. v.x = __int2half_rn(vui & 0xF);
  963. v.y = __int2half_rn(vui >> 4);
  964. v = __hsub2(v, __floats2half2_rn(8.0f, 8.0f));
  965. v = __hmul2(v, {d, d});
  966. }
  967. static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  968. const block_q4_1 * x = (const block_q4_1 *) vx;
  969. const dfloat d = __low2half(x[ib].dm);
  970. const dfloat m = __high2half(x[ib].dm);
  971. const int vui = x[ib].qs[iqs];
  972. v.x = __int2half_rn(vui & 0xF);
  973. v.y = __int2half_rn(vui >> 4);
  974. v = __hmul2(v, {d, d});
  975. v = __hadd2(v, {m, m});
  976. }
  977. static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
  978. const block_q5_0 * x = (const block_q5_0 *) vx;
  979. const dfloat d = x[ib].d;
  980. uint32_t qh;
  981. memcpy(&qh, x[ib].qh, sizeof(qh));
  982. const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
  983. const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
  984. v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0);
  985. v.y = __int2half_rn((x[ib].qs[iqs] >> 4) | xh_1);
  986. v = __hsub2(v, __floats2half2_rn(16.0f, 16.0f));
  987. v = __hmul2(v, {d, d});
  988. }
  989. static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  990. const block_q5_1 * x = (const block_q5_1 *) vx;
  991. const dfloat d = __low2half(x[ib].dm);
  992. const dfloat m = __high2half(x[ib].dm);
  993. uint32_t qh;
  994. memcpy(&qh, x[ib].qh, sizeof(qh));
  995. const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
  996. const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
  997. v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0);
  998. v.y = __int2half_rn((x[ib].qs[iqs] >> 4) | xh_1);
  999. v = __hmul2(v, {d, d});
  1000. v = __hadd2(v, {m, m});
  1001. }
  1002. static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
  1003. const block_q8_0 * x = (const block_q8_0 *) vx;
  1004. const dfloat d = x[ib].d;
  1005. v.x = __int2half_rn(x[ib].qs[iqs + 0]);
  1006. v.y = __int2half_rn(x[ib].qs[iqs + 1]);
  1007. v = __hmul2(v, {d, d});
  1008. }
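// Recovered formulas for the legacy block formats above (each call produces two values,
// matching qr):
//   q4_0: w = d * (q - 8)          q4_1: w = d * q + m
//   q5_0: w = d * (q - 16), with the 5th bit of q taken from the packed qh word
//   q5_1: w = d * q + m, same 5th-bit handling
//   q8_0: w = d * q, two adjacent int8 values per call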
  1009. template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
  1010. static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
  1011. const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
  1012. if (i >= k) {
  1013. return;
  1014. }
  1015. const int ib = i/qk; // block index
  1016. const int iqs = (i%qk)/qr; // quant index
  1017. const int iybs = i - i%qk; // y block start index
  1018. const int y_offset = qr == 1 ? 1 : qk/2;
  1019. // dequantize
  1020. dfloat2 v;
  1021. dequantize_kernel(vx, ib, iqs, v);
  1022. y[iybs + iqs + 0] = v.x;
  1023. y[iybs + iqs + y_offset] = v.y;
  1024. }
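// Index sketch for the generic kernel above, using q4_0 (qk = 32, qr = 2) as an example:
// the thread with global index t handles the value pair i = 2*t. For i = 4 this gives
// ib = 0, iqs = 2, iybs = 0 and y_offset = 16, so the call dequantizes byte qs[2] of
// block 0 and writes the low nibble to y[2] and the high nibble to y[18], matching
// dequantize_q4_0 above.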
  1025. template<typename dst_t>
  1026. static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1027. const int i = blockIdx.x;
  1028. const block_q2_K * x = (const block_q2_K *) vx;
  1029. const int tid = threadIdx.x;
  1030. const int n = tid/32;
  1031. const int l = tid - 32*n;
  1032. const int is = 8*n + l/16;
  1033. const uint8_t q = x[i].qs[32*n + l];
  1034. dst_t * y = yy + i*QK_K + 128*n;
  1035. half dall = __low2half(x[i].dm);
  1036. half dmin = __high2half(x[i].dm);
  1037. y[l+ 0] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4)));
  1038. y[l+32] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4)));
  1039. y[l+64] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4)));
  1040. y[l+96] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4)));
  1041. }
  1042. template<typename dst_t>
  1043. static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1044. const int i = blockIdx.x;
  1045. const block_q3_K * x = (const block_q3_K *) vx;
  1046. const int r = threadIdx.x/4;
  1047. const int tid = r/2;
  1048. const int is0 = r%2;
  1049. const int l0 = 16*is0 + 4*(threadIdx.x%4);
  1050. const int n = tid / 4;
  1051. const int j = tid - 4*n;
  1052. uint8_t m = 1 << (4*n + j);
  1053. int is = 8*n + 2*j + is0;
  1054. int shift = 2*j;
  1055. int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
  1056. is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
  1057. is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
  1058. (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
  1059. half d_all = x[i].d;
  1060. half dl = __hmul(d_all, __int2half_rn(us - 32));
  1061. dst_t * y = yy + i*QK_K + 128*n + 32*j;
  1062. const uint8_t * q = x[i].qs + 32*n;
  1063. const uint8_t * hm = x[i].hmask;
  1064. for (int l = l0; l < l0+4; ++l) y[l] = __hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)));
  1065. }
  1066. static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
  1067. if (j < 4) {
  1068. d = q[j] & 63; m = q[j + 4] & 63;
  1069. } else {
  1070. d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
  1071. m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
  1072. }
  1073. }
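// get_scale_min_k4 unpacks the 12-byte scales block of q4_K/q5_K into a 6-bit scale d
// and 6-bit min m for sub-block j: for j < 4 they are the low 6 bits of q[j] and
// q[j + 4]; for j >= 4 the low 4 bits come from the two nibbles of q[j + 4] and the top
// 2 bits are borrowed from the high bits of q[j - 4] (scale) and q[j] (min).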
  1074. template<typename dst_t>
  1075. static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1076. const block_q4_K * x = (const block_q4_K *) vx;
  1077. const int i = blockIdx.x;
  1078. // assume 32 threads
  1079. const int tid = threadIdx.x;
  1080. const int il = tid/8;
  1081. const int ir = tid%8;
  1082. const int is = 2*il;
  1083. const int n = 4;
  1084. dst_t * y = yy + i*QK_K + 64*il + n*ir;
  1085. const half dall = __low2half(x[i].dm);
  1086. const half dmin = __high2half(x[i].dm);
  1087. const uint8_t * q = x[i].qs + 32*il + n*ir;
  1088. uint8_t sc, m;
  1089. get_scale_min_k4(is + 0, x[i].scales, sc, m);
  1090. const half d1 = __hmul(dall, __int2half_rn(sc));
  1091. const half m1 = __hmul(dmin, __int2half_rn(m));
  1092. get_scale_min_k4(is + 1, x[i].scales, sc, m);
  1093. const half d2 = __hmul(dall, __int2half_rn(sc));
  1094. const half m2 = __hmul(dmin, __int2half_rn(m));
  1095. for (int l = 0; l < n; ++l) {
  1096. y[l + 0] = __hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1);
  1097. y[l +32] = __hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2);
  1098. }
  1099. }
  1100. template<typename dst_t>
  1101. static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1102. const block_q5_K * x = (const block_q5_K *) vx;
  1103. const int i = blockIdx.x;
  1104. // assume 64 threads - this is very slightly better than the one below
  1105. const int tid = threadIdx.x;
  1106. const int il = tid/16; // il is in 0...3
  1107. const int ir = tid%16; // ir is in 0...15
  1108. const int is = 2*il; // is is in 0...6
  1109. dst_t * y = yy + i*QK_K + 64*il + 2*ir;
  1110. const half dall = __low2half(x[i].dm);
  1111. const half dmin = __high2half(x[i].dm);
  1112. const uint8_t * ql = x[i].qs + 32*il + 2*ir;
  1113. const uint8_t * qh = x[i].qh + 2*ir;
  1114. uint8_t sc, m;
  1115. get_scale_min_k4(is + 0, x[i].scales, sc, m);
  1116. const half d1 = __hmul(dall, __int2half_rn(sc)); const half m1 = __hmul(dmin, __int2half_rn(m));
  1117. get_scale_min_k4(is + 1, x[i].scales, sc, m);
  1118. const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m));
  1119. uint8_t hm = 1 << (2*il);
  1120. y[ 0] = __hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1);
  1121. y[ 1] = __hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1);
  1122. hm <<= 1;
  1123. y[32] = __hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2);
  1124. y[33] = __hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 16 : 0))), m2);
  1125. }
  1126. template<typename dst_t>
  1127. static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1128. const block_q6_K * x = (const block_q6_K *) vx;
  1129. const int i = blockIdx.x;
  1130. // assume 64 threads - this is very slightly better than the one below
  1131. const int tid = threadIdx.x;
  1132. const int ip = tid/32; // ip is 0 or 1
  1133. const int il = tid - 32*ip; // 0...32
  1134. const int is = 8*ip + il/16;
  1135. dst_t * y = yy + i*QK_K + 128*ip + il;
  1136. const half d = x[i].d;
  1137. const uint8_t * ql = x[i].ql + 64*ip + il;
  1138. const uint8_t qh = x[i].qh[32*ip + il];
  1139. const int8_t * sc = x[i].scales + is;
  1140. y[ 0] = __hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)));
  1141. y[32] = __hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
  1142. y[64] = __hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)));
  1143. y[96] = __hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)));
  1144. }
  1145. template<typename dst_t>
  1146. static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1147. const int i = blockIdx.x;
  1148. const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
  1149. const int tid = threadIdx.x;
  1150. const int il = tid/8; // 0...3
  1151. const int ib = tid%8; // 0...7
  1152. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1153. const uint16_t * q2 = x[i].qs + 4*ib;
  1154. const uint8_t * aux8 = (const uint8_t *)q2;
  1155. const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
  1156. const uint32_t aux32 = q2[2] | (q2[3] << 16);
  1157. const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f;
  1158. const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
  1159. for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
  1160. }
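// Decode walkthrough for iq2_xxs: each group of 32 outputs uses four uint16 of qs. The
// first two (viewed as aux8[0..3]) are grid indices into iq2xxs_grid; the last two form
// aux32, whose top 4 bits give the sub-block scale (d * (0.5 + s) * 0.25) and whose low
// 28 bits hold four 7-bit sign fields that are expanded through ksigns_iq2xs.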
  1161. template<typename dst_t>
  1162. static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1163. const int i = blockIdx.x;
  1164. const block_iq2_xs * x = (const block_iq2_xs *) vx;
  1165. const int tid = threadIdx.x;
  1166. const int il = tid/8; // 0...3
  1167. const int ib = tid%8; // 0...7
  1168. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1169. const uint16_t * q2 = x[i].qs + 4*ib;
  1170. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
  1171. const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  1172. const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
  1173. for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
  1174. }
  1175. template<typename dst_t>
  1176. static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1177. const int i = blockIdx.x;
  1178. const block_iq2_s * x = (const block_iq2_s *) vx;
  1179. const int tid = threadIdx.x;
  1180. const int il = tid/8; // 0...3
  1181. const int ib = tid%8; // 0...7
  1182. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1183. const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
  1184. const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  1185. const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
  1186. for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
  1187. }
  1188. template<typename dst_t>
  1189. static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1190. const int i = blockIdx.x;
  1191. const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
  1192. const int tid = threadIdx.x;
  1193. const int il = tid/8; // 0...3
  1194. const int ib = tid%8; // 0...7
  1195. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1196. const uint8_t * q3 = x[i].qs + 8*ib;
  1197. const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
  1198. const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
  1199. const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
  1200. const uint32_t aux32 = gas[0] | (gas[1] << 16);
  1201. const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f;
  1202. const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
  1203. for (int j = 0; j < 4; ++j) {
  1204. y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
  1205. y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
  1206. }
  1207. }
  1208. template<typename dst_t>
  1209. static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1210. const int i = blockIdx.x;
  1211. const block_iq3_s * x = (const block_iq3_s *) vx;
  1212. const int tid = threadIdx.x;
  1213. const int il = tid/8; // 0...3
  1214. const int ib = tid%8; // 0...7
  1215. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1216. const uint8_t * qs = x[i].qs + 8*ib;
  1217. const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
  1218. const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
  1219. const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
  1220. const uint8_t signs = x[i].signs[4*ib + il];
  1221. for (int j = 0; j < 4; ++j) {
  1222. y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
  1223. y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
  1224. }
  1225. }
  1226. template<typename dst_t>
  1227. static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1228. const int i = blockIdx.x;
  1229. const block_iq1_s * x = (const block_iq1_s *) vx;
  1230. const int tid = threadIdx.x;
  1231. const int il = tid/8; // 0...3
  1232. const int ib = tid%8; // 0...7
  1233. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1234. const int i8 = 4*ib+il;
  1235. uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
  1236. const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
  1237. const float d = __half2float(x[i].d) * (2*(h & 7) + 1);
  1238. for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j]);
  1239. }
  1240. template<typename dst_t>
  1241. static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1242. const int i = blockIdx.x;
  1243. const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
  1244. const int tid = threadIdx.x;
  1245. const int il = tid/8; // 0...3
  1246. const int ib = tid%8; // 0...7
  1247. dst_t * y = yy + i*QK_K + 32*ib + 4*il;
  1248. const uint8_t * q4 = x[ib].qs + 4*il;
  1249. const float d = __half2float(x[ib].d);
  1250. for (int j = 0; j < 4; ++j) {
  1251. y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
  1252. y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]);
  1253. }
  1254. }
  1255. template<typename dst_t>
  1256. static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1257. const int i = blockIdx.x;
  1258. const block_iq4_xs * x = (const block_iq4_xs *)vx;
  1259. const int tid = threadIdx.x;
  1260. const int il = tid/8; // 0...3
  1261. const int ib = tid%8; // 0...7
  1262. dst_t * y = yy + i*QK_K + 32*ib + 4*il;
  1263. const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
  1264. const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
  1265. for (int j = 0; j < 4; ++j) {
  1266. y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
  1267. y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]);
  1268. }
  1269. }
  1270. template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
  1271. static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
  1272. const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
  1273. dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  1274. }
  1275. template<typename dst_t>
  1276. static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1277. const int nb = k / QK_K;
  1278. dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
  1279. }
  1280. template<typename dst_t>
  1281. static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1282. const int nb = k / QK_K;
  1283. dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
  1284. }
  1285. template<typename dst_t>
  1286. static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1287. const int nb = k / QK_K;
  1288. dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
  1289. }
  1290. template<typename dst_t>
  1291. static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1292. const int nb = k / QK_K;
  1293. dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
  1294. }
  1295. template<typename dst_t>
  1296. static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1297. const int nb = k / QK_K;
  1298. dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
  1299. }
  1300. template<typename dst_t>
  1301. static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1302. const int nb = k / QK_K;
  1303. dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
  1304. }
  1305. template<typename dst_t>
  1306. static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1307. const int nb = k / QK_K;
  1308. dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
  1309. }
  1310. template<typename dst_t>
  1311. static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1312. const int nb = k / QK_K;
  1313. dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
  1314. }
  1315. template<typename dst_t>
  1316. static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1317. const int nb = k / QK_K;
  1318. dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
  1319. }
  1320. template<typename dst_t>
  1321. static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1322. const int nb = k / QK_K;
  1323. dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
  1324. }
  1325. template<typename dst_t>
  1326. static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1327. const int nb = k / QK_K;
  1328. dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
  1329. }
  1330. template<typename dst_t>
  1331. static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1332. const int nb = (k + QK_K - 1) / QK_K;
  1333. dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
  1334. }
  1335. template<typename dst_t>
  1336. static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1337. const int nb = (k + QK_K - 1) / QK_K;
  1338. dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
  1339. }
  1340. static to_fp16_cuda_t ggml_get_to_fp16_cuda(int type) {
  1341. switch (type) {
  1342. case 2:
  1343. return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
  1344. case 3:
  1345. return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
  1346. case 6:
  1347. return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
  1348. case 7:
  1349. return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
  1350. case 8:
  1351. return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
  1352. case 10:
  1353. return dequantize_row_q2_K_cuda;
  1354. case 11:
  1355. return dequantize_row_q3_K_cuda;
  1356. case 12:
  1357. return dequantize_row_q4_K_cuda;
  1358. case 13:
  1359. return dequantize_row_q5_K_cuda;
  1360. case 14:
  1361. return dequantize_row_q6_K_cuda;
  1362. case 16:
  1363. return dequantize_row_iq2_xxs_cuda;
  1364. case 17:
  1365. return dequantize_row_iq2_xs_cuda;
  1366. case 18:
  1367. return dequantize_row_iq3_xxs_cuda;
  1368. case 19:
  1369. return dequantize_row_iq1_s_cuda;
  1370. case 20:
  1371. return dequantize_row_iq4_nl_cuda;
  1372. case 21:
  1373. return dequantize_row_iq3_s_cuda;
  1374. case 22:
  1375. return dequantize_row_iq2_s_cuda;
  1376. case 23:
  1377. return dequantize_row_iq4_xs_cuda;
  1378. default:
  1379. return nullptr;
  1380. }
  1381. }
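// Usage sketch (host side): look up the converter for a quantization type and run it on
// a stream. The numeric case labels above appear to follow ggml's type ids (e.g. 2 for
// Q4_0); d_quant, d_half, nelem and stream below are placeholders supplied by the caller.
//   to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(2); // Q4_0
//   if (to_fp16 != nullptr) {
//       to_fp16(d_quant, d_half, nelem, stream); // writes nelem half values to d_half
//   }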
  1382. // GEMV
  1383. template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
  1384. static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, dfloat * __restrict__ dst, const int ncols, const int nrows) {
  1385. // qk = quantized weights per x block
  1386. // qr = number of quantized weights per data value in x block
  1387. const int row = blockIdx.x*blockDim.y + threadIdx.y;
  1388. if (row >= nrows) {
  1389. return;
  1390. }
  1391. const int tid = threadIdx.x;
  1392. const int iter_stride = 2*GGML_CUDA_DMMV_X;
  1393. const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
  1394. const int y_offset = qr == 1 ? 1 : qk/2;
  1395. half2 tmp = __floats2half2_rn(0.0f, 0.0f); // two sums for f16 to take advantage of half2 intrinsics
  1396. for (int i = 0; i < ncols; i += iter_stride) {
  1397. const int col = i + vals_per_iter*tid;
  1398. const int ib = (row*ncols + col)/qk; // x block index
  1399. const int iqs = (col%qk)/qr; // x quant index
  1400. const int iybs = col - col%qk; // y block start index
  1401. // processing >2 values per i iter is faster for fast GPUs
  1402. #pragma unroll
  1403. for (int j = 0; j < vals_per_iter; j += 2) {
  1404. // process 2 vals per j iter
  1405. // dequantize
  1406. // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
  1407. dfloat2 v;
  1408. dequantize_kernel(vx, ib, iqs + j/qr, v);
  1409. // matrix multiplication
  1410. // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
  1411. tmp = __hadd2(tmp, __hmul2(v, {
  1412. y[iybs + iqs + j/qr + 0],
  1413. y[iybs + iqs + j/qr + y_offset]
  1414. }));
  1415. }
  1416. }
  1417. // sum up partial sums and write back result
  1418. #pragma unroll
  1419. for (int mask = 16; mask > 0; mask >>= 1) {
  1420. tmp = __hadd2(tmp, __shfl_xor_sync(0xffffffff, tmp, mask, 32));
  1421. }
  1422. if (tid == 0) {
  1423. dst[row] = __hadd(tmp.x, tmp.y);
  1424. }
  1425. }
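// The reduction above is the standard xor-shuffle butterfly: with masks 16, 8, 4, 2, 1
// each lane adds the accumulator of lane (lane ^ mask), so after log2(32) = 5 steps every
// lane holds the sum over the whole warp and lane 0 writes the two half2 halves out as a
// single half. The same pattern is reused by all of the *_mul_mat_vec kernels below.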
  1426. static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1427. static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
  1428. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1429. if (row >= nrows) return;
  1430. const int num_blocks_per_row = ncols / QK_K;
  1431. const int ib0 = row*num_blocks_per_row;
  1432. const block_q2_K * x = (const block_q2_K *)vx + ib0;
  1433. float tmp = 0; // partial sum for thread in warp
  1434. const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  1435. const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
  1436. const int step = 16/K_QUANTS_PER_ITERATION;
  1437. const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
  1438. const int in = tid - step*im; // 0...15 or 0...7
  1439. const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
  1440. const int q_offset = 32*im + l0;
  1441. const int s_offset = 8*im;
  1442. const int y_offset = 128*im + l0;
  1443. uint32_t aux[4];
  1444. const uint8_t * d = (const uint8_t *)aux;
  1445. const uint8_t * m = (const uint8_t *)(aux + 2);
  1446. for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
  1447. const half * y = yy + i * QK_K + y_offset;
  1448. const uint8_t * q = x[i].qs + q_offset;
  1449. const float dall = __low2float(x[i].dm);
  1450. const float dmin = __high2float(x[i].dm);
  1451. const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
  1452. aux[0] = a[0] & 0x0f0f0f0f;
  1453. aux[1] = a[1] & 0x0f0f0f0f;
  1454. aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
  1455. aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
  1456. float sum1 = 0, sum2 = 0;
  1457. for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
  1458. sum1 += __half2float(y[l+ 0]) * d[0] * ((q[l+ 0] >> 0) & 3)
  1459. + __half2float(y[l+32]) * d[2] * ((q[l+ 0] >> 2) & 3)
  1460. + __half2float(y[l+64]) * d[4] * ((q[l+ 0] >> 4) & 3)
  1461. + __half2float(y[l+96]) * d[6] * ((q[l+ 0] >> 6) & 3)
  1462. + __half2float(y[l+16]) * d[1] * ((q[l+16] >> 0) & 3)
  1463. + __half2float(y[l+48]) * d[3] * ((q[l+16] >> 2) & 3)
  1464. + __half2float(y[l+80]) * d[5] * ((q[l+16] >> 4) & 3)
1465. + __half2float(y[l+112]) * d[7] * ((q[l+16] >> 6) & 3);
1466. sum2 += __half2float(y[l+ 0]) * m[0] + __half2float(y[l+32]) * m[2] + __half2float(y[l+64]) * m[4] + __half2float(y[l+96]) * m[6]
  1467. + __half2float(y[l+16]) * m[1] + __half2float(y[l+48]) * m[3] + __half2float(y[l+80]) * m[5] + __half2float(y[l+112]) * m[7];
  1468. }
  1469. tmp += dall * sum1 - dmin * sum2;
  1470. }
  1471. // sum up partial sums and write back result
  1472. #pragma unroll
  1473. for (int mask = 16; mask > 0; mask >>= 1) {
  1474. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1475. }
  1476. if (threadIdx.x == 0) {
  1477. dst[row] = __float2half(tmp);
  1478. }
  1479. }
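// q2_K layout, as used above (a sketch of the assumed format): each super-block holds
// QK_K = 256 values split into 16 groups of 16; scales[16] packs a 4-bit scale (low
// nibble) and a 4-bit min (high nibble) per group, and dm stores (d, dmin) as half2.
// A weight is reconstructed as
//   w = d * sc * q - dmin * m,   q in 0..3,
// which is why the kernel accumulates sum1 (quant terms) and sum2 (min terms) separately
// and combines them as dall * sum1 - dmin * sum2.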
  1480. static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1481. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1482. if (row >= nrows) return;
  1483. const int num_blocks_per_row = ncols / QK_K;
  1484. const int ib0 = row*num_blocks_per_row;
  1485. const block_q3_K * x = (const block_q3_K *)vx + ib0;
  1486. float tmp = 0; // partial sum for thread in warp
  1487. const uint16_t kmask1 = 0x0303;
  1488. const uint16_t kmask2 = 0x0f0f;
1489. const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  1490. const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
  1491. const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
  1492. const int step = 16/K_QUANTS_PER_ITERATION;
  1493. const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
  1494. const int in = tid - step*im; // 0....15 or 0...7
  1495. const uint8_t m = 1 << (4*im);
  1496. const int l0 = n*in; // 0...15 or 0...14 in steps of 2
  1497. const int q_offset = 32*im + l0;
  1498. const int y_offset = 128*im + l0;
  1499. uint16_t utmp[4];
  1500. const int8_t * s = (const int8_t *)utmp;
  1501. const uint16_t s_shift = 4*im;
  1502. for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
  1503. const half * y = yy + i * QK_K + y_offset;
  1504. const uint8_t * q = x[i].qs + q_offset;
  1505. const uint8_t * h = x[i].hmask + l0;
  1506. const uint16_t * a = (const uint16_t *)x[i].scales;
  1507. utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
  1508. utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
  1509. utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
  1510. utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
  1511. const float d = __half2float(x[i].d);
  1512. float sum = 0;
  1513. for (int l = 0; l < n; ++l) {
  1514. sum += __half2float(y[l+ 0]) * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
  1515. + __half2float(y[l+32]) * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
  1516. + __half2float(y[l+64]) * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
  1517. + __half2float(y[l+96]) * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
  1518. sum += __half2float(y[l+16]) * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
  1519. + __half2float(y[l+48]) * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
  1520. + __half2float(y[l+80]) * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
  1521. + __half2float(y[l+112]) * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
  1522. }
  1523. tmp += d * sum;
  1524. }
  1525. // sum up partial sums and write back result
  1526. #pragma unroll
  1527. for (int mask = 16; mask > 0; mask >>= 1) {
  1528. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1529. }
  1530. if (threadIdx.x == 0) {
  1531. dst[row] = __float2half(tmp);
  1532. }
  1533. }
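// q3_K, as decoded above: 3-bit quants store their low 2 bits in qs[] and the third bit
// in hmask[]; 16 six-bit group scales are packed into scales[12] and unpacked via the
// kmask1/kmask2 shifts into utmp. Each weight follows
//   w = d * (sc - 32) * q3,   q3 = (2-bit value) - 4 * (high bit clear),
// so q3 ranges over -4..3, matching the "(h[l] & m ? 0 : 4)" terms in the inner sum.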
  1534. static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1535. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1536. if (row >= nrows) return;
  1537. const int num_blocks_per_row = ncols / QK_K;
  1538. const int ib0 = row*num_blocks_per_row;
  1539. const block_q4_K * x = (const block_q4_K *)vx + ib0;
  1540. const uint16_t kmask1 = 0x3f3f;
  1541. const uint16_t kmask2 = 0x0f0f;
  1542. const uint16_t kmask3 = 0xc0c0;
1543. const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  1544. const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
  1545. const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
  1546. const int il = tid/step; // 0...3
  1547. const int ir = tid - step*il; // 0...7 or 0...3
  1548. const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
  1549. const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
  1550. const int in = il%2;
  1551. const int l0 = n*(2*ir + in);
  1552. const int q_offset = 32*im + l0;
  1553. const int y_offset = 64*im + l0;
  1554. uint16_t aux[4];
  1555. const uint8_t * sc = (const uint8_t *)aux;
  1556. #if K_QUANTS_PER_ITERATION == 2
  1557. uint32_t q32[4];
  1558. const uint8_t * q4 = (const uint8_t *)q32;
  1559. #else
  1560. uint16_t q16[4];
  1561. const uint8_t * q4 = (const uint8_t *)q16;
  1562. #endif
  1563. float tmp = 0; // partial sum for thread in warp
  1564. for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
  1565. const half * y1 = yy + i*QK_K + y_offset;
  1566. const half * y2 = y1 + 128;
  1567. const float dall = __low2float(x[i].dm);
  1568. const float dmin = __high2float(x[i].dm);
  1569. const uint16_t * a = (const uint16_t *)x[i].scales;
  1570. aux[0] = a[im+0] & kmask1;
  1571. aux[1] = a[im+2] & kmask1;
  1572. aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
  1573. aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
  1574. #if K_QUANTS_PER_ITERATION == 2
  1575. const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
  1576. const uint32_t * q2 = q1 + 16;
  1577. q32[0] = q1[0] & 0x0f0f0f0f;
  1578. q32[1] = q1[0] & 0xf0f0f0f0;
  1579. q32[2] = q2[0] & 0x0f0f0f0f;
  1580. q32[3] = q2[0] & 0xf0f0f0f0;
  1581. float4 s = {0.f, 0.f, 0.f, 0.f};
  1582. float smin = 0;
  1583. for (int l = 0; l < 4; ++l) {
  1584. s.x += __half2float(y1[l]) * q4[l+0]; s.y += __half2float(y1[l+32]) * q4[l+ 4];
  1585. s.z += __half2float(y2[l]) * q4[l+8]; s.w += __half2float(y2[l+32]) * q4[l+12];
  1586. smin += __half2float(y1[l]) * sc[2] + __half2float(y1[l+32]) * sc[3] + __half2float(y2[l]) * sc[6] + __half2float(y2[l+32]) * sc[7];
  1587. }
  1588. tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
  1589. #else
  1590. const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
  1591. const uint16_t * q2 = q1 + 32;
  1592. q16[0] = q1[0] & 0x0f0f;
  1593. q16[1] = q1[0] & 0xf0f0;
  1594. q16[2] = q2[0] & 0x0f0f;
  1595. q16[3] = q2[0] & 0xf0f0;
  1596. float4 s = {0.f, 0.f, 0.f, 0.f};
  1597. float smin = 0;
  1598. for (int l = 0; l < 2; ++l) {
  1599. s.x += __half2float(y1[l]) * q4[l+0]; s.y += __half2float(y1[l+32]) * q4[l+2];
  1600. s.z += __half2float(y2[l]) * q4[l+4]; s.w += __half2float(y2[l+32]) * q4[l+6];
  1601. smin += __half2float(y1[l]) * sc[2] + __half2float(y1[l+32]) * sc[3] + __half2float(y2[l]) * sc[6] + __half2float(y2[l+32]) * sc[7];
  1602. }
  1603. tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
  1604. #endif
  1605. }
  1606. // sum up partial sums and write back result
  1607. #pragma unroll
  1608. for (int mask = 16; mask > 0; mask >>= 1) {
  1609. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1610. }
  1611. if (tid == 0) {
  1612. dst[row] = __float2half(tmp);
  1613. }
  1614. }
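// q4_K, as decoded above: 256 values in 8 groups of 32 with 4-bit quants in qs[]; the
// 6-bit scales and mins of the 8 groups are packed into scales[12] and recovered with
// the kmask1/kmask2/kmask3 masks. Weights follow w = d * sc * q - dmin * m. Note that
// q32[1]/q32[3] (and q16[1]/q16[3]) keep the high nibbles in place, so those partial
// sums are 16x too large and the matching scale is multiplied by 1.f/16.f.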
  1615. static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols) {
  1616. const int row = blockIdx.x;
  1617. const int num_blocks_per_row = ncols / QK_K;
  1618. const int ib0 = row*num_blocks_per_row;
  1619. const block_q5_K * x = (const block_q5_K *)vx + ib0;
  1620. float tmp = 0; // partial sum for thread in warp
  1621. const uint16_t kmask1 = 0x3f3f;
  1622. const uint16_t kmask2 = 0x0f0f;
  1623. const uint16_t kmask3 = 0xc0c0;
  1624. const int tid = threadIdx.x/2; // 0...15
  1625. const int ix = threadIdx.x%2;
  1626. const int il = tid/4; // 0...3
  1627. const int ir = tid - 4*il;// 0...3
  1628. const int n = 2;
  1629. const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
  1630. const int in = il%2;
  1631. const int l0 = n*(2*ir + in);
  1632. const int q_offset = 32*im + l0;
  1633. const int y_offset = 64*im + l0;
  1634. const uint8_t hm1 = 1 << (2*im);
  1635. const uint8_t hm2 = hm1 << 4;
  1636. uint16_t aux[4];
  1637. const uint8_t * sc = (const uint8_t *)aux;
  1638. uint16_t q16[8];
  1639. const uint8_t * q4 = (const uint8_t *)q16;
  1640. for (int i = ix; i < num_blocks_per_row; i += 2) {
  1641. const uint8_t * ql1 = x[i].qs + q_offset;
  1642. const uint8_t * qh = x[i].qh + l0;
  1643. const half * y1 = yy + i*QK_K + y_offset;
  1644. const half * y2 = y1 + 128;
  1645. const float dall = __low2float(x[i].dm);
  1646. const float dmin = __high2float(x[i].dm);
  1647. const uint16_t * a = (const uint16_t *)x[i].scales;
  1648. aux[0] = a[im+0] & kmask1;
  1649. aux[1] = a[im+2] & kmask1;
  1650. aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
  1651. aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
  1652. float4 sum = {0.f, 0.f, 0.f, 0.f};
  1653. float smin = 0;
  1654. const uint16_t * q1 = (const uint16_t *)ql1;
  1655. const uint16_t * q2 = q1 + 32;
  1656. q16[0] = q1[0] & 0x0f0f;
  1657. q16[1] = q1[8] & 0x0f0f;
  1658. q16[2] = (q1[0] >> 4) & 0x0f0f;
  1659. q16[3] = (q1[8] >> 4) & 0x0f0f;
  1660. q16[4] = q2[0] & 0x0f0f;
  1661. q16[5] = q2[8] & 0x0f0f;
  1662. q16[6] = (q2[0] >> 4) & 0x0f0f;
  1663. q16[7] = (q2[8] >> 4) & 0x0f0f;
  1664. for (int l = 0; l < n; ++l) {
  1665. sum.x += __half2float(y1[l+ 0]) * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
  1666. + __half2float(y1[l+16]) * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
  1667. sum.y += __half2float(y1[l+32]) * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
  1668. + __half2float(y1[l+48]) * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
  1669. sum.z += __half2float(y2[l+ 0]) * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
  1670. + __half2float(y2[l+16]) * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
  1671. sum.w += __half2float(y2[l+32]) * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
  1672. + __half2float(y2[l+48]) * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
  1673. smin += (__half2float(y1[l]) + __half2float(y1[l+16])) * sc[2] + (__half2float(y1[l+32]) + __half2float(y1[l+48])) * sc[3]
  1674. + (__half2float(y2[l]) + __half2float(y2[l+16])) * sc[6] + (__half2float(y2[l+32]) + __half2float(y2[l+48])) * sc[7];
  1675. }
  1676. tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
  1677. }
  1678. // sum up partial sums and write back result
  1679. #pragma unroll
  1680. for (int mask = 16; mask > 0; mask >>= 1) {
  1681. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1682. }
  1683. if (threadIdx.x == 0) {
  1684. dst[row] = __float2half(tmp);
  1685. }
  1686. }
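// q5_K is q4_K plus one extra bit per value in qh[]: when the bit selected by hm1/hm2 is
// set, 16 is added to the 4-bit quant (the "? 16 : 0" terms above), extending the range
// to 0..31. Scales and mins are packed exactly as in q4_K, so the final combination is
// again tmp += dall * (weighted sums) - dmin * smin.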
  1687. static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1688. static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
  1689. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1690. if (row >= nrows) return;
  1691. const int num_blocks_per_row = ncols / QK_K;
  1692. const int ib0 = row*num_blocks_per_row;
  1693. const block_q6_K * x = (const block_q6_K *)vx + ib0;
1694. const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  1695. const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
  1696. const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
  1697. const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
  1698. const int in = tid - step*im; // 0...15 or 0...7
  1699. #if K_QUANTS_PER_ITERATION == 1
  1700. const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
  1701. const int is = 0;
  1702. #else
  1703. const int l0 = 4 * in; // 0, 4, 8, ..., 28
  1704. const int is = in / 4;
  1705. #endif
  1706. const int ql_offset = 64*im + l0;
  1707. const int qh_offset = 32*im + l0;
  1708. const int s_offset = 8*im + is;
  1709. const int y_offset = 128*im + l0;
  1710. float tmp = 0; // partial sum for thread in warp
  1711. for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
  1712. const half * y = yy + i * QK_K + y_offset;
  1713. const uint8_t * ql = x[i].ql + ql_offset;
  1714. const uint8_t * qh = x[i].qh + qh_offset;
  1715. const int8_t * s = x[i].scales + s_offset;
  1716. const float d = __half2float(x[i].d);
  1717. #if K_QUANTS_PER_ITERATION == 1
  1718. float sum = __half2float(y[ 0]) * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
  1719. + __half2float(y[16]) * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
  1720. + __half2float(y[32]) * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
  1721. + __half2float(y[48]) * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
  1722. + __half2float(y[64]) * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
  1723. + __half2float(y[80]) * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
  1724. + __half2float(y[96]) * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
1725. + __half2float(y[112]) * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
  1726. tmp += sum;
  1727. #else
  1728. float sum = 0;
  1729. for (int l = 0; l < 4; ++l) {
  1730. sum += __half2float(y[l+ 0]) * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
  1731. + __half2float(y[l+32]) * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
  1732. + __half2float(y[l+64]) * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
  1733. + __half2float(y[l+96]) * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
  1734. }
  1735. tmp += sum;
  1736. #endif
  1737. }
  1738. // sum up partial sums and write back result
  1739. #pragma unroll
  1740. for (int mask = 16; mask > 0; mask >>= 1) {
  1741. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1742. }
  1743. if (tid == 0) {
  1744. dst[row] = __float2half(tmp);
  1745. }
  1746. }
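// q6_K, as decoded above: 6-bit quants store their low 4 bits in ql[] and the top 2 bits
// in qh[], with one int8 scale per group of 16 values; a weight is
//   w = d * s * (q - 32),
// which is the "- 32" applied after reassembling the 6-bit value in both branches above.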
  1747. static __global__ void dequantize_mul_mat_vec_iq2_xxs(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1748. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1749. if (row >= nrows) return;
  1750. const int num_blocks_per_row = ncols / QK_K;
  1751. const int ib0 = row*num_blocks_per_row;
  1752. const block_iq2_xxs * x = (const block_iq2_xxs *)vx + ib0;
  1753. float tmp = 0; // partial sum for thread in warp
  1754. const int tid = threadIdx.x/4;
  1755. const int ix = threadIdx.x%4;
  1756. const int q_offset = tid * 4;
  1757. const int y_offset = tid * 32;
  1758. for (int i = ix; i < num_blocks_per_row; i += 4) {
  1759. const half * y = yy + i * QK_K + y_offset;
  1760. const uint16_t * q = x[i].qs + q_offset;
  1761. const uint8_t * aux8 = (const uint8_t *)q;
  1762. uint32_t aux32 = q[2] | (q[3] << 16);
  1763. float sumi = 0;
  1764. for (int l = 0; l < 4; ++l) {
  1765. const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
  1766. const uint8_t signs = ksigns_iq2xs[aux32 & 127];
  1767. for (int j = 0; j < 8; ++j) {
  1768. sumi += __half2float(y[j]) * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  1769. }
  1770. y += 8;
  1771. aux32 >>= 7;
  1772. }
1773. tmp += sumi * __half2float(x[i].d) * (0.5f + aux32) * 0.25f;
  1774. }
  1775. // sum up partial sums and write back result
  1776. #pragma unroll
  1777. for (int mask = 16; mask > 0; mask >>= 1) {
  1778. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1779. }
  1780. if (threadIdx.x == 0) {
  1781. dst[row] = __float2half(tmp);
  1782. }
  1783. }
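// iq2_xxs decoding, as used above (assumed codebook layout): each group of 8 weights is
// described by one byte (aux8[l]) indexing the iq2xxs_grid table of 8-byte magnitude
// patterns, while aux32 supplies a 7-bit index into ksigns_iq2xs for the signs of those
// 8 values (7 bits consumed per group). After four groups the remaining top 4 bits of
// aux32 are the group-of-32 scale, applied as d * (0.5f + scale) * 0.25f.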
  1784. static __global__ void dequantize_mul_mat_vec_iq2_xs(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1785. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1786. if (row >= nrows) return;
  1787. const int num_blocks_per_row = ncols / QK_K;
  1788. const int ib0 = row*num_blocks_per_row;
  1789. const block_iq2_xs * x = (const block_iq2_xs *)vx + ib0;
  1790. float tmp = 0; // partial sum for thread in warp
  1791. const int tid = threadIdx.x/4;
  1792. const int ix = threadIdx.x%4;
  1793. const int q_offset = tid * 4;
  1794. const int s_offset = tid;
  1795. const int y_offset = tid * 32;
  1796. for (int i = ix; i < num_blocks_per_row; i += 4) {
  1797. const half * y = yy + i * QK_K + y_offset;
  1798. const uint16_t * q = x[i].qs + q_offset;
  1799. const uint8_t ls1 = x[i].scales[s_offset] & 0xf;
  1800. const uint8_t ls2 = x[i].scales[s_offset] >> 4;
  1801. float sumi1 = 0;
  1802. for (int l = 0; l < 2; ++l) {
  1803. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q[l] & 511));
  1804. const uint8_t signs = ksigns_iq2xs[q[l] >> 9];
  1805. for (int j = 0; j < 8; ++j) {
  1806. sumi1 += __half2float(y[j]) * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  1807. }
  1808. y += 8;
  1809. }
  1810. float sumi2 = 0;
  1811. for (int l = 2; l < 4; ++l) {
  1812. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q[l] & 511));
  1813. const uint8_t signs = ksigns_iq2xs[q[l] >> 9];
  1814. for (int j = 0; j < 8; ++j) {
  1815. sumi2 += __half2float(y[j]) * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  1816. }
  1817. y += 8;
  1818. }
  1819. const float d = __half2float(x[i].d) * 0.25f;
1820. tmp += d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
  1821. }
  1822. // sum up partial sums and write back result
  1823. #pragma unroll
  1824. for (int mask = 16; mask > 0; mask >>= 1) {
  1825. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1826. }
  1827. if (threadIdx.x == 0) {
  1828. dst[row] = __float2half(tmp);
  1829. }
  1830. }
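// iq2_xs differs from iq2_xxs in that every group of 8 has an explicit 16-bit entry:
// the low 9 bits index iq2xs_grid and the top 7 bits index ksigns_iq2xs. Per-32-value
// 4-bit scales come from x[i].scales (low nibble for the first half, high nibble for
// the second), applied as d * 0.25f * (0.5f + ls).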
  1831. static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1832. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1833. // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
  1834. const dim3 block_nums(block_num_y, 1, 1);
  1835. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1836. dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
  1837. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1838. }
  1839. static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1840. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1841. const dim3 block_nums(block_num_y, 1, 1);
  1842. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1843. dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
  1844. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1845. }
  1846. static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1847. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1848. const dim3 block_nums(block_num_y, 1, 1);
  1849. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1850. dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
  1851. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1852. }
  1853. static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1854. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1855. const dim3 block_nums(block_num_y, 1, 1);
  1856. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1857. dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
  1858. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1859. }
  1860. static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1861. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1862. const dim3 block_nums(block_num_y, 1, 1);
  1863. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1864. dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
  1865. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1866. }
  1867. static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1868. const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
  1869. const int block_num_y = (nrows + ny - 1) / ny;
  1870. const dim3 block_nums(block_num_y, 1, 1);
  1871. const dim3 block_dims(32, ny, 1);
  1872. dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1873. }
  1874. static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1875. const int ny = 2 / K_QUANTS_PER_ITERATION;
  1876. const int block_num_y = (nrows + ny - 1) / ny;
  1877. const dim3 block_nums(block_num_y, 1, 1);
  1878. const dim3 block_dims(32, ny, 1);
  1879. dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1880. }
  1881. static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1882. const int ny = 2 / K_QUANTS_PER_ITERATION;
  1883. const int block_num_y = (nrows + ny - 1) / ny;
  1884. const dim3 block_nums(block_num_y, 1, 1);
  1885. const dim3 block_dims(32, ny, 1);
  1886. dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1887. }
  1888. static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1889. const dim3 block_dims(32, 1, 1);
  1890. dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
  1891. }
  1892. static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1893. const int ny = 2 / K_QUANTS_PER_ITERATION;
  1894. const int block_num_y = (nrows + ny - 1) / ny;
  1895. const dim3 block_nums(block_num_y, 1, 1);
  1896. const dim3 block_dims(32, ny, 1);
  1897. dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1898. }
  1899. static void dequantize_mul_mat_vec_iq2_xxs_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1900. const dim3 block_dims(32, 1, 1);
  1901. dequantize_mul_mat_vec_iq2_xxs<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1902. }
  1903. static void dequantize_mul_mat_vec_iq2_xs_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1904. const dim3 block_dims(32, 1, 1);
  1905. dequantize_mul_mat_vec_iq2_xs<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1906. }
  1907. // Q8 gemv
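// quantize_q8_1 turns one (zero-padded) activation row into block_q8_1 blocks of
// QK8_1 (32) int8 values. Per block, roughly:
//   d = max_i |x_i| / 127,   q_i = round(x_i / d),   ds = (d, sum_i x_i) as half2.
// The stored sum lets the *_q8_1 dot products below fold in the offset/min terms of the
// weight formats without re-reading the activations. The host wrapper pads the row
// length to a multiple of 512 columns so every block is fully populated.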
  1908. static __global__ void quantize_q8_1(const half * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
  1909. const int ix = blockDim.x*blockIdx.x + threadIdx.x;
  1910. if (ix >= kx_padded) {
  1911. return;
  1912. }
  1913. const int iy = blockDim.y*blockIdx.y + threadIdx.y;
  1914. const int i_padded = iy*kx_padded + ix;
  1915. block_q8_1 * y = (block_q8_1 *) vy;
  1916. const int ib = i_padded / QK8_1; // block index
  1917. const int iqs = i_padded % QK8_1; // quant index
  1918. const float xi = ix < kx ? __half2float(x[iy*kx + ix]) : 0.0f;
  1919. float amax = fabsf(xi);
  1920. float sum = xi;
  1921. #pragma unroll
  1922. for (int mask = 16; mask > 0; mask >>= 1) {
  1923. amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
  1924. sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
  1925. }
  1926. const float d = amax / 127;
  1927. const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
  1928. y[ib].qs[iqs] = q;
  1929. if (iqs > 0) {
  1930. return;
  1931. }
  1932. y[ib].ds.x = __float2half(d);
  1933. y[ib].ds.y = __float2half(sum);
  1934. }
  1935. static void quantize_row_q8_1_cuda(const half * x, void * vy, const int kx, const int ky, cudaStream_t stream) {
  1936. const int64_t kx_padded = (kx + 512 - 1) / 512 * 512;
  1937. const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
  1938. const dim3 num_blocks(block_num_x, ky, 1);
1939. const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
  1940. quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
  1941. }
  1942. #define VDR_Q4_0_Q8_1_MMVQ 2
  1943. #define VDR_Q4_0_Q8_1_MMQ 4
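// VDR is assumed to mean "vec dot ratio": the number of 32-bit ints (groups of 4 quants)
// a single thread feeds into one *_impl call, with separate values for the
// mul_mat_vec_q (MMVQ) path and the tiled mul_mat_q (MMQ) path. The impl functions rely
// on __dp4a (4-way int8 dot product with 32-bit accumulate), which exists only on sm_61
// and newer; hence the __CUDA_ARCH__ >= 610 guards around their bodies.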
  1944. template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
  1945. const int * v, const int * u, const float & d4, const half2 & ds8) {
  1946. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  1947. int sumi = 0;
  1948. #pragma unroll
  1949. for (int i = 0; i < vdr; ++i) {
  1950. const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
  1951. const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
  1952. // SIMD dot product of quantized values
  1953. sumi = __dp4a(vi0, u[2*i+0], sumi);
  1954. sumi = __dp4a(vi1, u[2*i+1], sumi);
  1955. }
  1956. const float2 ds8f = __half22float2(ds8);
  1957. // second part effectively subtracts 8 from each quant value
  1958. return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
  1959. #endif
  1960. }
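// Sketch of where the "(8*vdr/QI4_0) * ds8f.y" term above comes from: the true q4_0
// contribution is sum_i d4*(q_i - 8)*y_i ~= d4 * (d8 * sum_i q_i*u_i - 8 * sum_i y_i).
// ds8f.y is the sum of y over the whole q8_1 block, and each call only covers a
// vdr/QI4_0 fraction of that block, so it subtracts 8 * (vdr/QI4_0) * ds8f.y here;
// summed over all the threads sharing the block this reproduces the full correction.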
  1961. #define VDR_Q4_1_Q8_1_MMVQ 2
  1962. #define VDR_Q4_1_Q8_1_MMQ 4
  1963. template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
  1964. const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
  1965. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  1966. int sumi = 0;
  1967. #pragma unroll
  1968. for (int i = 0; i < vdr; ++i) {
  1969. const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
  1970. const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
  1971. // SIMD dot product of quantized values
  1972. sumi = __dp4a(vi0, u[2*i+0], sumi);
  1973. sumi = __dp4a(vi1, u[2*i+1], sumi);
  1974. }
  1975. const float2 tmp = __half22float2(__hmul2(dm4, ds8));
  1976. const float d4d8 = tmp.x;
  1977. const float m4s8 = tmp.y;
  1978. // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
  1979. return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
  1980. #endif
  1981. }
  1982. #define VDR_Q5_0_Q8_1_MMVQ 2
  1983. #define VDR_Q5_0_Q8_1_MMQ 4
  1984. template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
  1985. const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
  1986. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  1987. int sumi = 0;
  1988. #pragma unroll
  1989. for (int i = 0; i < vdr; ++i) {
  1990. int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
  1991. vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
  1992. vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
  1993. vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
  1994. vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
  1995. sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
  1996. int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
  1997. vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
  1998. vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
  1999. vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
  2000. vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
  2001. sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
  2002. }
  2003. const float2 ds8f = __half22float2(ds8);
  2004. // second part effectively subtracts 16 from each quant value
  2005. return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
  2006. #endif
  2007. }
  2008. #define VDR_Q5_1_Q8_1_MMVQ 2
  2009. #define VDR_Q5_1_Q8_1_MMQ 4
  2010. template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
  2011. const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
  2012. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2013. int sumi = 0;
  2014. #pragma unroll
  2015. for (int i = 0; i < vdr; ++i) {
  2016. int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
  2017. vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
  2018. vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
  2019. vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
  2020. vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
  2021. sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
  2022. int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
  2023. vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
  2024. vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
  2025. vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
  2026. vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
  2027. sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
  2028. }
  2029. const float2 tmp = __half22float2(__hmul2(dm5, ds8));
  2030. const float d5d8 = tmp.x;
  2031. const float m5s8 = tmp.y;
  2032. // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
  2033. return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
  2034. #endif
  2035. }
  2036. #define VDR_Q8_0_Q8_1_MMVQ 2
  2037. #define VDR_Q8_0_Q8_1_MMQ 8
  2038. template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
  2039. const int * v, const int * u, const float & d8_0, const float & d8_1) {
  2040. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2041. int sumi = 0;
  2042. #pragma unroll
  2043. for (int i = 0; i < vdr; ++i) {
  2044. // SIMD dot product of quantized values
  2045. sumi = __dp4a(v[i], u[i], sumi);
  2046. }
  2047. return d8_0*d8_1 * sumi;
  2048. #endif
  2049. }
  2050. template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
  2051. const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
  2052. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2053. int sumi = 0;
  2054. #pragma unroll
  2055. for (int i = 0; i < vdr; ++i) {
  2056. // SIMD dot product of quantized values
  2057. sumi = __dp4a(v[i], u[i], sumi);
  2058. }
  2059. const float2 tmp = __half22float2(__hmul2(dm8, ds8));
  2060. const float d8d8 = tmp.x;
  2061. const float m8s8 = tmp.y;
  2062. // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
  2063. return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
  2064. #endif
  2065. }
  2066. #define VDR_Q2_K_Q8_1_MMVQ 1
  2067. #define VDR_Q2_K_Q8_1_MMQ 2
  2068. // contiguous v/x values
  2069. static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
  2070. const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
  2071. const half2 & dm2, const float * __restrict__ d8) {
  2072. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2073. float sumf_d = 0.0f;
  2074. float sumf_m = 0.0f;
  2075. #pragma unroll
  2076. for (int i = 0; i < QR2_K; ++i) {
  2077. const int sc = scales[2*i];
  2078. const int vi = (v >> (2*i)) & 0x03030303;
  2079. sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
  2080. // fill int with 4x m
  2081. int m = sc >> 4;
  2082. m |= m << 8;
  2083. m |= m << 16;
  2084. sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
  2085. }
  2086. const float2 dm2f = __half22float2(dm2);
  2087. return dm2f.x*sumf_d - dm2f.y*sumf_m;
  2088. #endif
  2089. }
  2090. static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
  2091. const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
  2092. const half2 & dm2, const float & d8) {
  2093. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2094. int sumi_d = 0;
  2095. int sumi_m = 0;
  2096. #pragma unroll
  2097. for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
  2098. int sumi_d_sc = 0;
  2099. const int sc = scales[i0 / (QI8_1/2)];
  2100. // fill int with 4x m
  2101. int m = sc >> 4;
  2102. m |= m << 8;
  2103. m |= m << 16;
  2104. #pragma unroll
  2105. for (int i = i0; i < i0 + QI8_1/2; ++i) {
  2106. sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
  2107. sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
  2108. }
  2109. sumi_d += sumi_d_sc * (sc & 0xF);
  2110. }
  2111. const float2 dm2f = __half22float2(dm2);
  2112. return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
  2113. #endif
  2114. }
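// Naming convention used from here on (assumed from the call sites below): the
// *_impl_mmvq helpers take values read directly from a single quantized block and are
// used by the mul-mat-vec path, while the *_impl_mmq helpers take values that were
// already repacked into the shared-memory tiles filled by the load_tiles_* functions;
// their scale arguments differ accordingly.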
  2115. #define VDR_Q3_K_Q8_1_MMVQ 1
  2116. #define VDR_Q3_K_Q8_1_MMQ 2
  2117. // contiguous v/x values
  2118. static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
  2119. const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
  2120. const int & scale_offset, const float & d3, const float * __restrict__ d8) {
  2121. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2122. float sumf = 0.0f;
  2123. #pragma unroll
  2124. for (int i = 0; i < QR3_K; ++i) {
  2125. const int isc = scale_offset + 2*i;
  2126. const int isc_low = isc % (QK_K/32);
  2127. const int sc_shift_low = 4 * (isc / (QK_K/32));
  2128. const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
  2129. const int isc_high = isc % (QK_K/64);
  2130. const int sc_shift_high = 2 * (isc / (QK_K/64));
  2131. const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
  2132. const int sc = (sc_low | sc_high) - 32;
  2133. const int vil = (vl >> (2*i)) & 0x03030303;
  2134. const int vih = ((vh >> i) << 2) & 0x04040404;
  2135. const int vi = __vsubss4(vil, vih);
  2136. sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
  2137. }
  2138. return d3 * sumf;
  2139. #endif
  2140. }
  2141. static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
  2142. const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
  2143. const float & d3, const float & d8) {
  2144. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2145. int sumi = 0;
  2146. #pragma unroll
  2147. for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
  2148. int sumi_sc = 0;
  2149. for (int i = i0; i < i0 + QI8_1/2; ++i) {
  2150. sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
  2151. }
  2152. sumi += sumi_sc * scales[i0 / (QI8_1/2)];
  2153. }
  2154. return d3*d8 * sumi;
  2155. #endif
  2156. }
  2157. #define VDR_Q4_K_Q8_1_MMVQ 2
  2158. #define VDR_Q4_K_Q8_1_MMQ 8
  2159. // contiguous v/x values
  2160. static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
  2161. const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  2162. const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
  2163. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2164. float sumf_d = 0.0f;
  2165. float sumf_m = 0.0f;
  2166. #pragma unroll
  2167. for (int i = 0; i < QR4_K; ++i) {
  2168. const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
  2169. const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
  2170. const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
  2171. const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
  2172. sumf_d += d8[i] * (dot1 * sc[i]);
  2173. sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
  2174. }
  2175. const float2 dm4f = __half22float2(dm4);
  2176. return dm4f.x*sumf_d - dm4f.y*sumf_m;
  2177. #endif
  2178. }
  2179. static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  2180. const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  2181. const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
  2182. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2183. float sumf_d = 0.0f;
  2184. float sumf_m = 0.0f;
  2185. #pragma unroll
  2186. for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
  2187. int sumi_d = 0;
  2188. #pragma unroll
  2189. for (int j = 0; j < QI8_1; ++j) {
  2190. sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
  2191. }
  2192. const float2 ds8f = __half22float2(ds8[i]);
  2193. sumf_d += ds8f.x * (sc[i] * sumi_d);
  2194. sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
  2195. }
  2196. const float2 dm4f = __half22float2(dm4);
  2197. return dm4f.x*sumf_d - dm4f.y*sumf_m;
  2198. #endif
  2199. }
  2200. #define VDR_Q5_K_Q8_1_MMVQ 2
  2201. #define VDR_Q5_K_Q8_1_MMQ 8
  2202. static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
  2203. const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  2204. const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
  2205. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2206. float sumf_d = 0.0f;
  2207. float sumf_m = 0.0f;
  2208. #pragma unroll
  2209. for (int i = 0; i < QR5_K; ++i) {
  2210. const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
  2211. const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
  2212. const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
  2213. const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
  2214. const int v0i = vl0i | vh0i;
  2215. const int v1i = vl1i | vh1i;
  2216. const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
  2217. const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
  2218. sumf_d += d8[i] * (dot1 * sc[i]);
  2219. sumf_m += d8[i] * (dot2 * m[i]);
  2220. }
  2221. const float2 dm5f = __half22float2(dm5);
  2222. return dm5f.x*sumf_d - dm5f.y*sumf_m;
  2223. #endif
  2224. }
  2225. static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
  2226. const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  2227. const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
  2228. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2229. float sumf_d = 0.0f;
  2230. float sumf_m = 0.0f;
  2231. #pragma unroll
  2232. for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
  2233. int sumi_d = 0;
  2234. #pragma unroll
  2235. for (int j = 0; j < QI8_1; ++j) {
  2236. sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
  2237. }
  2238. const float2 ds8f = __half22float2(ds8[i]);
  2239. sumf_d += ds8f.x * (sc[i] * sumi_d);
2240. sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q5_K min val
  2241. }
  2242. const float2 dm4f = __half22float2(dm4);
  2243. return dm4f.x*sumf_d - dm4f.y*sumf_m;
  2244. #endif
  2245. }
  2246. #define VDR_Q6_K_Q8_1_MMVQ 1
  2247. #define VDR_Q6_K_Q8_1_MMQ 8
  2248. // contiguous v/x values
  2249. static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
  2250. const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
  2251. const float & d, const float * __restrict__ d8) {
  2252. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2253. float sumf = 0.0f;
  2254. #pragma unroll
  2255. for (int i = 0; i < QR6_K; ++i) {
  2256. const int sc = scales[4*i];
  2257. const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
  2258. const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
  2259. const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
  2260. sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
  2261. }
  2262. return d*sumf;
  2263. #endif
  2264. }
  2265. static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
  2266. const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
  2267. const float & d6, const float * __restrict__ d8) {
  2268. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  2269. float sumf_d = 0.0f;
  2270. #pragma unroll
  2271. for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
  2272. int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
  2273. #pragma unroll
  2274. for (int i = i0; i < i0 + 2; ++i) {
  2275. sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
  2276. sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
  2277. sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
  2278. sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
  2279. }
  2280. sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
  2281. }
  2282. return d6 * sumf_d;
  2283. #endif
  2284. }
  2285. static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
  2286. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2287. const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
  2288. int v[VDR_Q4_0_Q8_1_MMVQ];
  2289. int u[2*VDR_Q4_0_Q8_1_MMVQ];
  2290. #pragma unroll
  2291. for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
  2292. v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
  2293. u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2294. u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
  2295. }
  2296. return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, __half2float(bq4_0->d), bq8_1->ds);
  2297. }
  2298. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2299. __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  2300. __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
  2301. *x_ql = tile_x_qs;
  2302. *x_dm = (half2 *) tile_x_d;
  2303. }
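// The tile row stride here is WARP_SIZE + 1 ints (the trailing "+ mmq_y" adds one pad
// element per row); this looks like the usual shared-memory padding trick to keep the
// 32 lanes of a warp on different banks when they read a tile column. tile_x_d holds
// the per-block q4_0 scales as plain floats, reinterpreted through the half2 pointer
// that the common tile interface expects.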
  2304. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
  2305. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2306. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2307. const int kbx = k / QI4_0;
  2308. const int kqsx = k % QI4_0;
  2309. const block_q4_0 * bx0 = (const block_q4_0 *) vx;
  2310. float * x_dmf = (float *) x_dm;
  2311. #pragma unroll
  2312. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2313. int i = i0 + i_offset;
  2314. if (need_check) {
  2315. i = min(i, i_max);
  2316. }
  2317. const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
  2318. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
  2319. // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
  2320. }
  2321. const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
  2322. const int kbxd = k % blocks_per_tile_x_row;
  2323. #pragma unroll
  2324. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
  2325. int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
  2326. if (need_check) {
  2327. i = min(i, i_max);
  2328. }
  2329. const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
  2330. x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = __half2float(bxi->d);
  2331. }
  2332. }
  2333. static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
  2334. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2335. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2336. (void)x_qh; (void)x_sc;
  2337. const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  2338. const float * x_dmf = (const float *) x_dm;
  2339. int u[2*VDR_Q4_0_Q8_1_MMQ];
  2340. #pragma unroll
  2341. for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
  2342. u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
  2343. u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
  2344. }
  2345. return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
  2346. (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
  2347. y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  2348. }
  2349. static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
  2350. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2351. const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
  2352. int v[VDR_Q4_1_Q8_1_MMVQ];
  2353. int u[2*VDR_Q4_1_Q8_1_MMVQ];
  2354. #pragma unroll
  2355. for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
  2356. v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
  2357. u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2358. u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
  2359. }
  2360. return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
  2361. }
  2362. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2363. __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  2364. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
  2365. *x_ql = tile_x_qs;
  2366. *x_dm = tile_x_dm;
  2367. }
  2368. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
  2369. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2370. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2371. const int kbx = k / QI4_1;
  2372. const int kqsx = k % QI4_1;
  2373. const block_q4_1 * bx0 = (const block_q4_1 *) vx;
  2374. #pragma unroll
  2375. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2376. int i = i0 + i_offset;
  2377. if (need_check) {
  2378. i = min(i, i_max);
  2379. }
  2380. const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
  2381. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2382. }
  2383. const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
  2384. const int kbxd = k % blocks_per_tile_x_row;
  2385. #pragma unroll
  2386. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
  2387. int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
  2388. if (need_check) {
  2389. i = min(i, i_max);
  2390. }
  2391. const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
  2392. x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
  2393. }
  2394. }
  2395. static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
  2396. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2397. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2398. const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  2399. int u[2*VDR_Q4_1_Q8_1_MMQ];
  2400. #pragma unroll
  2401. for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
  2402. u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
  2403. u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
  2404. }
  2405. return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
  2406. (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
  2407. y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  2408. }
  2409. static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
  2410. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2411. const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
  2412. int vl[VDR_Q5_0_Q8_1_MMVQ];
  2413. int vh[VDR_Q5_0_Q8_1_MMVQ];
  2414. int u[2*VDR_Q5_0_Q8_1_MMVQ];
  2415. #pragma unroll
  2416. for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
  2417. vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
  2418. vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
  2419. u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2420. u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
  2421. }
  2422. return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, __half2float(bq5_0->d), bq8_1->ds);
  2423. }
  2424. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2425. __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  2426. __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
  2427. *x_ql = tile_x_ql;
  2428. *x_dm = (half2 *) tile_x_d;
  2429. }
  2430. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
  2431. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2432. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2433. const int kbx = k / QI5_0;
  2434. const int kqsx = k % QI5_0;
  2435. const block_q5_0 * bx0 = (const block_q5_0 *) vx;
  2436. #pragma unroll
  2437. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2438. int i = i0 + i_offset;
  2439. if (need_check) {
  2440. i = min(i, i_max);
  2441. }
  2442. const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
  2443. const int ql = get_int_from_uint8(bxi->qs, kqsx);
  2444. const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
  2445. int qs0 = (ql >> 0) & 0x0F0F0F0F;
  2446. qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
  2447. qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
  2448. qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
  2449. qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
  2450. qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
  2451. x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
  2452. int qs1 = (ql >> 4) & 0x0F0F0F0F;
  2453. qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
  2454. qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
  2455. qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
  2456. qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
  2457. qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
  2458. x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
  2459. }
  2460. const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
  2461. const int kbxd = k % blocks_per_tile_x_row;
  2462. float * x_dmf = (float *) x_dm;
  2463. #pragma unroll
  2464. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
  2465. int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
  2466. if (need_check) {
  2467. i = min(i, i_max);
  2468. }
  2469. const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
  2470. x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = __half2float(bxi->d);
  2471. }
  2472. }
  2473. static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
  2474. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2475. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2476. const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  2477. const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
  2478. const float * x_dmf = (const float *) x_dm;
  2479. const float * y_df = (const float *) y_ds;
  2480. int u[2*VDR_Q5_0_Q8_1_MMQ];
  2481. #pragma unroll
  2482. for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
  2483. u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
  2484. u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
  2485. }
  2486. return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
  2487. (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  2488. }
  2489. static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
  2490. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2491. const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
  2492. int vl[VDR_Q5_1_Q8_1_MMVQ];
  2493. int vh[VDR_Q5_1_Q8_1_MMVQ];
  2494. int u[2*VDR_Q5_1_Q8_1_MMVQ];
  2495. #pragma unroll
  2496. for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
  2497. vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
  2498. vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
  2499. u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2500. u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
  2501. }
  2502. return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
  2503. }
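// Shared-memory tiles for the q5_1 MMQ path: 2*WARP_SIZE ints of unpacked quants per tile row
// plus one int of padding per row (the "+ mmq_y" term, presumably to avoid shared-memory bank
// conflicts), and one half2 (d, m) pair per q5_1 block. No x_qh/x_sc tiles are needed because
// the high bits are folded into x_ql at load time and q5_1 has no per-sub-block scales.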
  2504. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2505. __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  2506. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
  2507. *x_ql = tile_x_ql;
  2508. *x_dm = tile_x_dm;
  2509. }
  2510. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
  2511. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2512. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2513. const int kbx = k / QI5_1;
  2514. const int kqsx = k % QI5_1;
  2515. const block_q5_1 * bx0 = (const block_q5_1 *) vx;
  2516. #pragma unroll
  2517. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2518. int i = i0 + i_offset;
  2519. if (need_check) {
  2520. i = min(i, i_max);
  2521. }
  2522. const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
  2523. const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2524. const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
  2525. int qs0 = (ql >> 0) & 0x0F0F0F0F;
  2526. qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
  2527. qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
  2528. qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
  2529. qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
  2530. x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
  2531. int qs1 = (ql >> 4) & 0x0F0F0F0F;
  2532. qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
  2533. qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
  2534. qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
  2535. qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
  2536. x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
  2537. }
  2538. const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
  2539. const int kbxd = k % blocks_per_tile_x_row;
  2540. #pragma unroll
  2541. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
  2542. int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
  2543. if (need_check) {
  2544. i = min(i, i_max);
  2545. }
  2546. const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
  2547. x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
  2548. }
  2549. }
  2550. static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
  2551. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2552. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2553. const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
const int index_bx = i * (WARP_SIZE/QI5_1) + i/QI5_1 + k/QI5_1;
  2555. int u[2*VDR_Q5_1_Q8_1_MMQ];
  2556. #pragma unroll
  2557. for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
  2558. u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
  2559. u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
  2560. }
  2561. return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
  2562. (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  2563. }
  2564. static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  2565. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2566. const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
  2567. int v[VDR_Q8_0_Q8_1_MMVQ];
  2568. int u[VDR_Q8_0_Q8_1_MMVQ];
  2569. #pragma unroll
  2570. for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
  2571. v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
  2572. u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2573. }
  2574. return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, __half2float(bq8_0->d), __low2float(bq8_1->ds));
  2575. }
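// q8_0 tiles: the quants are already 8 bit, so they are copied straight into x_ql and only a
// float scale per block is needed. The scale tile is declared as float but handed out through
// the generic half2 pointer; the q8_0 load/dot functions cast it back to float* before use.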
  2576. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2577. __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  2578. __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
  2579. *x_ql = tile_x_qs;
  2580. *x_dm = (half2 *) tile_x_d;
  2581. }
  2582. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
  2583. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2584. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2585. const int kbx = k / QI8_0;
  2586. const int kqsx = k % QI8_0;
  2587. float * x_dmf = (float *) x_dm;
  2588. const block_q8_0 * bx0 = (const block_q8_0 *) vx;
  2589. #pragma unroll
  2590. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2591. int i = i0 + i_offset;
  2592. if (need_check) {
  2593. i = min(i, i_max);
  2594. }
  2595. const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
  2596. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
  2597. }
  2598. const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
  2599. const int kbxd = k % blocks_per_tile_x_row;
  2600. #pragma unroll
  2601. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
  2602. int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
  2603. if (need_check) {
  2604. i = min(i, i_max);
  2605. }
  2606. const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
  2607. x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = __half2float(bxi->d);
  2608. }
  2609. }
  2610. static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
  2611. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2612. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2613. const float * x_dmf = (const float *) x_dm;
  2614. const float * y_df = (const float *) y_ds;
  2615. return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
  2616. (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
  2617. y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
  2618. }
  2619. static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  2620. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2621. const block_q2_K * bq2_K = (const block_q2_K *) vbq;
  2622. const int bq8_offset = QR2_K * (iqs / QI8_1);
  2623. const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
  2624. const uint8_t * scales = bq2_K->scales + scale_offset;
  2625. const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
  2626. int u[QR2_K];
  2627. float d8[QR2_K];
  2628. #pragma unroll
  2629. for (int i = 0; i < QR2_K; ++ i) {
  2630. u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
  2631. d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
  2632. }
  2633. return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
  2634. }
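// k-quant tiles (q2_K): besides the quants and the per-superblock (d, dmin) pair in x_dm, a
// third tile x_sc caches the packed 4-bit scales/mins (one int per four 16-value sub-blocks) so
// the MMQ dot product can read sub-block scales without going back to global memory.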
  2635. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2636. __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  2637. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
  2638. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
  2639. *x_ql = tile_x_ql;
  2640. *x_dm = tile_x_dm;
  2641. *x_sc = tile_x_sc;
  2642. }
  2643. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
  2644. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2645. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2646. const int kbx = k / QI2_K;
  2647. const int kqsx = k % QI2_K;
  2648. const block_q2_K * bx0 = (const block_q2_K *) vx;
  2649. #pragma unroll
  2650. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2651. int i = i0 + i_offset;
  2652. if (need_check) {
  2653. i = min(i, i_max);
  2654. }
  2655. const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
  2656. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2657. }
  2658. const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
  2659. const int kbxd = k % blocks_per_tile_x_row;
  2660. #pragma unroll
  2661. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
  2662. int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
  2663. if (need_check) {
  2664. i = min(i, i_max);
  2665. }
  2666. const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
  2667. x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
  2668. }
  2669. #pragma unroll
  2670. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
  2671. int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
  2672. if (need_check) {
  2673. i = min(i, i_max);
  2674. }
  2675. const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
  2676. x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
  2677. }
  2678. }
  2679. static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
  2680. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2681. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2682. const int kbx = k / QI2_K;
  2683. const int ky = (k % QI2_K) * QR2_K;
  2684. const float * y_df = (const float *) y_ds;
  2685. int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
  2686. const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
  2687. const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
  2688. #pragma unroll
  2689. for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
  2690. v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
  2691. }
  2692. const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
  2693. const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
  2694. return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
  2695. }
  2696. static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
  2697. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2698. const block_q3_K * bq3_K = (const block_q3_K *) vbq;
  2699. const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
  2700. const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
  2701. const float d = __half2float(bq3_K->d);
  2702. const int vl = get_int_from_uint8(bq3_K->qs, iqs);
  2703. // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
  2704. const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
  2705. int u[QR3_K];
  2706. float d8[QR3_K];
  2707. #pragma unroll
  2708. for (int i = 0; i < QR3_K; ++i) {
  2709. u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
  2710. d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
  2711. }
  2712. return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
  2713. }
  2714. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2715. __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  2716. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
  2717. __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
  2718. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
  2719. *x_ql = tile_x_ql;
  2720. *x_dm = tile_x_dm;
  2721. *x_qh = tile_x_qh;
  2722. *x_sc = tile_x_sc;
  2723. }
  2724. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
  2725. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2726. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2727. const int kbx = k / QI3_K;
  2728. const int kqsx = k % QI3_K;
  2729. const block_q3_K * bx0 = (const block_q3_K *) vx;
  2730. #pragma unroll
  2731. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2732. int i = i0 + i_offset;
  2733. if (need_check) {
  2734. i = min(i, i_max);
  2735. }
  2736. const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
  2737. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
  2738. }
  2739. const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
  2740. const int kbxd = k % blocks_per_tile_x_row;
  2741. float * x_dmf = (float *) x_dm;
  2742. #pragma unroll
  2743. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
  2744. int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
  2745. if (need_check) {
  2746. i = min(i, i_max);
  2747. }
  2748. const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
  2749. x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = __half2float(bxi->d);
  2750. }
  2751. #pragma unroll
  2752. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
  2753. int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
  2754. if (need_check) {
  2755. i = min(i, i_max);
  2756. }
  2757. const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
  2758. // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
  2759. x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
  2760. }
  2761. #pragma unroll
  2762. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
  2763. int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
  2764. if (need_check) {
  2765. i = min(i, i_max);
  2766. }
  2767. const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
  2768. const int ksc = k % (QI3_K/4);
  2769. const int ksc_low = ksc % (QI3_K/8);
  2770. const int shift_low = 4 * (ksc / (QI3_K/8));
  2771. const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
  2772. const int ksc_high = QI3_K/8;
  2773. const int shift_high = 2 * ksc;
  2774. const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
  2775. const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
  2776. x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
  2777. }
  2778. }
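// MMQ dot product for q3_K. The low 2 bits come from x_ql and the already-inverted high-bit
// mask from x_qh; __vsubss4 subtracts 4 wherever that inverted bit is set, reproducing the
// 4/0 offset described in the loader above. The 6-bit sub-block scales were unpacked and
// re-centred around zero (the 0x20202020 subtraction) in load_tiles_q3_K.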
  2779. static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
  2780. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2781. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2782. const int kbx = k / QI3_K;
  2783. const int ky = (k % QI3_K) * QR3_K;
  2784. const float * x_dmf = (const float *) x_dm;
  2785. const float * y_df = (const float *) y_ds;
  2786. const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
  2787. int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
  2788. #pragma unroll
  2789. for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
  2790. const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
  2791. const int shift = 2 * ((ky % 32) / 8);
  2792. const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
  2793. const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
  2794. const int vlh = (vh << 2) & 0x04040404;
  2795. v[l] = __vsubss4(vll, vlh);
  2796. }
  2797. const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
  2798. return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
  2799. }
  2800. static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  2801. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2802. const block_q4_K * bq4_K = (const block_q4_K *) vbq;
  2803. int v[2];
  2804. int u[2*QR4_K];
  2805. float d8[QR4_K];
// iqs is in 0,2..30. bq8_offset = 2*(iqs/8) -> bq8_offset = 0, 2, 4, 6
const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
// iqs/2 =  0.... 3 -> bq8_offset = 0, want q4_offset =  0,   4,   8,  12
// iqs/2 =  4.... 7 -> bq8_offset = 2, want q4_offset = 32,  36,  40,  44
// iqs/2 =  8...11 -> bq8_offset = 4, want q4_offset = 64,  68,  72,  76
// iqs/2 = 12...15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
  2812. const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
  2813. v[0] = q4[0];
  2814. v[1] = q4[4];
  2815. const uint16_t * scales = (const uint16_t *)bq4_K->scales;
  2816. uint16_t aux[2];
  2817. const int j = bq8_offset/2;
  2818. if (j < 2) {
  2819. aux[0] = scales[j+0] & 0x3f3f;
  2820. aux[1] = scales[j+2] & 0x3f3f;
  2821. } else {
  2822. aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
  2823. aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
  2824. }
  2825. const uint8_t * sc = (const uint8_t *)aux;
  2826. const uint8_t * m = sc + 2;
  2827. for (int i = 0; i < QR4_K; ++i) {
  2828. const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
  2829. d8[i] = __low2float(bq8i->ds);
  2830. const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  2831. u[2*i+0] = q8[0];
  2832. u[2*i+1] = q8[4];
  2833. }
  2834. return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
  2835. }
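// q4_K MMQ tiles: one int of quants per k position, one half2 (d, dmin) per 256-value
// superblock, and one int of repacked scales per 8 k positions. The repacking in
// load_tiles_q4_K below turns the 12-byte K-quant scale block into 8 bytes of 6-bit scales
// followed by 8 bytes of 6-bit mins, so the dot product can address them as two plain byte
// arrays (sc and sc + 8).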
  2836. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2837. __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  2838. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
  2839. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
  2840. *x_ql = tile_x_ql;
  2841. *x_dm = tile_x_dm;
  2842. *x_sc = tile_x_sc;
  2843. }
  2844. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
  2845. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2846. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2847. const int kbx = k / QI4_K; // == 0 if QK_K == 256
  2848. const int kqsx = k % QI4_K; // == k if QK_K == 256
  2849. const block_q4_K * bx0 = (const block_q4_K *) vx;
  2850. #pragma unroll
  2851. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2852. int i = i0 + i_offset;
  2853. if (need_check) {
  2854. i = min(i, i_max);
  2855. }
  2856. const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
  2857. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2858. }
  2859. const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
  2860. const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
  2861. #pragma unroll
  2862. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
  2863. int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
  2864. if (need_check) {
  2865. i = min(i, i_max);
  2866. }
  2867. const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
  2868. x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
  2869. }
  2870. #pragma unroll
  2871. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
  2872. int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
  2873. if (need_check) {
  2874. i = min(i, i_max);
  2875. }
  2876. const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
  2877. const int * scales = (const int *) bxi->scales;
  2878. const int ksc = k % (WARP_SIZE/8);
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
  2880. int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
  2881. scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
  2882. x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
  2883. }
  2884. }
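// With the layout produced above, k/16 selects the int holding the sub-block scales for this
// 128-quant chunk and 2*((k % 16) / 8) steps to the right scale pair inside it; the matching
// mins sit a fixed 8 bytes later, hence the sc + 8 argument.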
  2885. static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  2886. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2887. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2888. (void)x_qh;
  2889. const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
  2890. const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
  2891. return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
  2892. x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
  2893. }
  2894. static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  2895. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2896. const block_q5_K * bq5_K = (const block_q5_K *) vbq;
  2897. int vl[2];
  2898. int vh[2];
  2899. int u[2*QR5_K];
  2900. float d8[QR5_K];
  2901. const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
  2902. const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
  2903. const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
  2904. vl[0] = ql[0];
  2905. vl[1] = ql[4];
  2906. vh[0] = qh[0] >> bq8_offset;
  2907. vh[1] = qh[4] >> bq8_offset;
  2908. const uint16_t * scales = (const uint16_t *)bq5_K->scales;
  2909. uint16_t aux[2];
  2910. const int j = bq8_offset/2;
  2911. if (j < 2) {
  2912. aux[0] = scales[j+0] & 0x3f3f;
  2913. aux[1] = scales[j+2] & 0x3f3f;
  2914. } else {
  2915. aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
  2916. aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
  2917. }
  2918. const uint8_t * sc = (const uint8_t *)aux;
  2919. const uint8_t * m = sc + 2;
  2920. #pragma unroll
  2921. for (int i = 0; i < QR5_K; ++i) {
  2922. const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
  2923. d8[i] = __low2float(bq8i->ds);
  2924. const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  2925. u[2*i+0] = q8[0];
  2926. u[2*i+1] = q8[4];
  2927. }
  2928. return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
  2929. }
  2930. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2931. __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  2932. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
  2933. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
  2934. *x_ql = tile_x_ql;
  2935. *x_dm = tile_x_dm;
  2936. *x_sc = tile_x_sc;
  2937. }
  2938. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
  2939. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2940. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2941. const int kbx = k / QI5_K; // == 0 if QK_K == 256
  2942. const int kqsx = k % QI5_K; // == k if QK_K == 256
  2943. const block_q5_K * bx0 = (const block_q5_K *) vx;
  2944. #pragma unroll
  2945. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2946. int i = i0 + i_offset;
  2947. if (need_check) {
  2948. i = min(i, i_max);
  2949. }
  2950. const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
  2951. const int ky = QR5_K*kqsx;
  2952. const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2953. const int ql0 = (ql >> 0) & 0x0F0F0F0F;
  2954. const int ql1 = (ql >> 4) & 0x0F0F0F0F;
  2955. const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
  2956. const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
  2957. const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
  2958. const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
  2959. const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
  2960. x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
  2961. x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
  2962. }
  2963. const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
  2964. const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
  2965. #pragma unroll
  2966. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
  2967. int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
  2968. if (need_check) {
  2969. i = min(i, i_max);
  2970. }
  2971. const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
  2972. x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
  2973. }
  2974. #pragma unroll
  2975. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
  2976. int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
  2977. if (need_check) {
  2978. i = min(i, i_max);
  2979. }
  2980. const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
  2981. const int * scales = (const int *) bxi->scales;
  2982. const int ksc = k % (WARP_SIZE/8);
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
  2984. int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
  2985. scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
  2986. x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
  2987. }
  2988. }
  2989. static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
  2990. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2991. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2992. const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
  2993. const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
  2994. const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
  2995. return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
  2996. x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
  2997. }
  2998. static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  2999. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3000. const block_q6_K * bq6_K = (const block_q6_K *) vbq;
  3001. const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
  3002. const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
  3003. const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
  3004. const int vl = get_int_from_uint8(bq6_K->ql, iqs);
  3005. const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
  3006. const int8_t * scales = bq6_K->scales + scale_offset;
  3007. int u[QR6_K];
  3008. float d8[QR6_K];
  3009. #pragma unroll
  3010. for (int i = 0; i < QR6_K; ++i) {
  3011. u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
  3012. d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
  3013. }
  3014. return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, __half2float(bq6_K->d), d8);
  3015. }
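// q6_K: the 6-bit quants are reassembled from 4 low bits (ql) and 2 high bits (qh) and
// re-centred by subtracting 32 (0x20202020) at load time, so the MMQ dot product works on
// plain signed values. The int8 sub-block scales are cached in x_sc and only the float d
// scale per superblock is kept, in the float-aliased x_dm tile.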
  3016. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  3017. __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  3018. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
  3019. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
  3020. *x_ql = tile_x_ql;
  3021. *x_dm = tile_x_dm;
  3022. *x_sc = tile_x_sc;
  3023. }
  3024. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
  3025. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  3026. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  3027. const int kbx = k / QI6_K; // == 0 if QK_K == 256
  3028. const int kqsx = k % QI6_K; // == k if QK_K == 256
  3029. const block_q6_K * bx0 = (const block_q6_K *) vx;
  3030. #pragma unroll
  3031. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  3032. int i = i0 + i_offset;
  3033. if (need_check) {
  3034. i = min(i, i_max);
  3035. }
  3036. const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
  3037. const int ky = QR6_K*kqsx;
  3038. const int ql = get_int_from_uint8(bxi->ql, kqsx);
  3039. const int ql0 = (ql >> 0) & 0x0F0F0F0F;
  3040. const int ql1 = (ql >> 4) & 0x0F0F0F0F;
  3041. const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
  3042. const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
  3043. const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
  3044. const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
  3045. const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
  3046. x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
  3047. x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
  3048. }
  3049. const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
  3050. const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
  3051. float * x_dmf = (float *) x_dm;
  3052. #pragma unroll
  3053. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
  3054. int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
  3055. if (need_check) {
  3056. i = min(i, i_max);
  3057. }
  3058. const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
  3059. x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = __half2float(bxi->d);
  3060. }
  3061. #pragma unroll
  3062. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
  3063. int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
  3064. if (need_check) {
  3065. i = min(i, i_max);
  3066. }
  3067. const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
  3068. x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
  3069. }
  3070. }
  3071. static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
  3072. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  3073. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  3074. const float * x_dmf = (const float *) x_dm;
  3075. const float * y_df = (const float *) y_ds;
  3076. const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
  3077. const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
  3078. const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
  3079. return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
  3080. }
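// The vec_dot_iq*_q8_1 functions below handle the non-linear "i-quants". Rather than storing
// quant values directly, each group of 8 values is an index into a fixed codebook
// (iq2xxs_grid, iq2xs_grid, iq3xxs_grid, ...) plus a packed sign pattern, so the dot product
// first reconstructs the 8 codebook values, applies the signs and then multiplies with the
// q8_1 quants.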
  3081. static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
  3082. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3083. const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
  3084. const int ib32 = iqs;
  3085. const uint16_t * q2 = bq2->qs + 4*ib32;
  3086. const uint8_t * aux8 = (const uint8_t *)q2;
  3087. const int8_t * q8 = bq8_1[ib32].qs;
  3088. uint32_t aux32 = q2[2] | (q2[3] << 16);
  3089. int sumi = 0;
  3090. for (int l = 0; l < 4; ++l) {
  3091. const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
  3092. const uint8_t signs = ksigns_iq2xs[aux32 & 127];
  3093. for (int j = 0; j < 8; ++j) {
  3094. sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  3095. }
  3096. q8 += 8;
  3097. aux32 >>= 7;
  3098. }
const float d = __half2float(bq2->d) * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
  3100. return d * sumi;
  3101. }
  3102. static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
  3103. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3104. const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
  3105. const int ib32 = iqs;
  3106. const uint16_t * q2 = bq2->qs + 4*ib32;
  3107. const int8_t * q8 = bq8_1[ib32].qs;
  3108. const uint8_t ls1 = bq2->scales[ib32] & 0xf;
  3109. const uint8_t ls2 = bq2->scales[ib32] >> 4;
  3110. int sumi1 = 0;
  3111. for (int l = 0; l < 2; ++l) {
  3112. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
  3113. const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
  3114. for (int j = 0; j < 8; ++j) {
  3115. sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  3116. }
  3117. q8 += 8;
  3118. }
  3119. int sumi2 = 0;
  3120. for (int l = 2; l < 4; ++l) {
  3121. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
  3122. const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
  3123. for (int j = 0; j < 8; ++j) {
  3124. sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  3125. }
  3126. q8 += 8;
  3127. }
const float d = __half2float(bq2->d) * __low2float(bq8_1[ib32].ds) * 0.25f;
  3129. return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
  3130. }
  3131. static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
  3132. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3133. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  3134. const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
  3135. const int ib32 = iqs;
  3136. const int8_t * q8 = bq8_1[ib32].qs;
  3137. const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
  3138. const uint8_t ls1 = bq2->scales[ib32] & 0xf;
  3139. const uint8_t ls2 = bq2->scales[ib32] >> 4;
  3140. int sumi1 = 0;
  3141. for (int l = 0; l < 2; ++l) {
  3142. const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
  3143. const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
  3144. const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
  3145. const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
  3146. const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
  3147. sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
  3148. sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
  3149. q8 += 8;
  3150. }
  3151. int sumi2 = 0;
  3152. for (int l = 2; l < 4; ++l) {
  3153. const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
  3154. const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
  3155. const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
  3156. const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
  3157. const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
  3158. sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
  3159. sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
  3160. q8 += 8;
  3161. }
  3162. const float d = __half2float(bq2->d) * __low2float(bq8_1[ib32].ds) * 0.25f;
  3163. return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else
// only reached when compiling for __CUDA_ARCH__ < 610, where these kernels are not expected to
// be launched; return 0 so the function still has a defined result
return 0.0f;
#endif
  3165. }
  3166. static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
  3167. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3168. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  3169. const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
  3170. const int ib32 = iqs;
  3171. const uint8_t * q3 = bq2->qs + 8*ib32;
  3172. const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
  3173. const int8_t * q8 = bq8_1[ib32].qs;
  3174. uint32_t aux32 = gas[0] | (gas[1] << 16);
  3175. int sumi = 0;
  3176. for (int l = 0; l < 4; ++l) {
  3177. const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
  3178. const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
  3179. const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
  3180. const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
  3181. const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
  3182. sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
  3183. sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
  3184. q8 += 8;
  3185. aux32 >>= 7;
  3186. }
  3187. const float d = __half2float(bq2->d) * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
  3188. return d * sumi;
#else
// only reached when compiling for __CUDA_ARCH__ < 610, where these kernels are not expected to
// be launched; return 0 so the function still has a defined result
return 0.0f;
#endif
  3190. }
  3191. static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
  3192. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3193. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  3194. const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
  3195. const int ib32 = iqs;
  3196. const uint8_t * qs = bq2->qs + 8*ib32;
  3197. const int8_t * q8 = bq8_1[ib32].qs;
  3198. int sumi = 0;
  3199. for (int l = 0; l < 4; ++l) {
  3200. const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
  3201. const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
  3202. uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
  3203. uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
  3204. const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
  3205. const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
  3206. sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
  3207. sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
  3208. q8 += 8;
  3209. }
  3210. const float d = __half2float(bq2->d) * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f;
  3211. return d * sumi;
#else
// only reached when compiling for __CUDA_ARCH__ < 610, where these kernels are not expected to
// be launched; return 0 so the function still has a defined result
return 0.0f;
#endif
  3213. }
  3214. static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
  3215. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3216. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  3217. const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
  3218. const int ib32 = iqs;
  3219. int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
  3220. const uint8_t h1 = bq1->scales[2*ib32+0];
  3221. const uint8_t h2 = bq1->scales[2*ib32+1];
  3222. const int * q8 = (const int *)bq8_1[ib32].qs;
  3223. const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
  3224. const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
  3225. const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
  3226. const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
  3227. for (int j = 0; j < 2; ++j) {
  3228. sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
  3229. sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
  3230. sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
  3231. sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
  3232. }
  3233. const float d = __half2float(bq1->d) * __low2float(bq8_1[ib32].ds);
  3234. return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
  3235. sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
#else
// only reached when compiling for __CUDA_ARCH__ < 610, where these kernels are not expected to
// be launched; return 0 so the function still has a defined result
return 0.0f;
#endif
  3237. }
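// Expands 8 packed 4-bit indices into two ints of 8-bit values by looking every nibble up in a
// 16-entry table (kvalues_iq4nl for the iq4 formats below): val1 receives the values of the
// four low nibbles, val2 those of the four high nibbles, so both can be fed directly to __dp4a.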
  3238. static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
  3239. int & val1, int & val2) {
  3240. uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
  3241. aux32 = q4 & 0x0f0f0f0f;
  3242. uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
  3243. uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
  3244. val1 = v1 | (v2 << 16);
  3245. aux32 = (q4 >> 4) & 0x0f0f0f0f;
  3246. v1 = values[q8[0]] | (values[q8[1]] << 8);
  3247. v2 = values[q8[2]] | (values[q8[3]] << 8);
  3248. val2 = v1 | (v2 << 16);
  3249. }
  3250. static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
  3251. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3252. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  3253. const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
  3254. const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
  3255. const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
  3256. const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
  3257. int v1, v2;
  3258. int sumi1 = 0, sumi2 = 0;
  3259. for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
  3260. const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
  3261. get_int_from_table_16(aux, values, v1, v2);
  3262. sumi1 = __dp4a(v1, q8[l+0], sumi1);
  3263. sumi2 = __dp4a(v2, q8[l+4], sumi2);
  3264. }
  3265. const float d = __half2float(bq->d) * __low2float(bq8_1->ds);
  3266. return d * (sumi1 + sumi2);
#else
// only reached when compiling for __CUDA_ARCH__ < 610, where these kernels are not expected to
// be launched; return 0 so the function still has a defined result
return 0.0f;
#endif
  3268. }
  3269. static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
  3270. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3271. #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
  3272. const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
  3273. const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
  3274. // iqs is 0...7
  3275. const int ib32 = iqs;
  3276. const int32_t * q8 = (const int *)bq8_1[ib32].qs;
  3277. const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
  3278. const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
  3279. const float d = __half2float(bq4->d) * (ls - 32) * __low2float(bq8_1[ib32].ds);
  3280. int v1, v2;
  3281. int sumi1 = 0, sumi2 = 0;
  3282. for (int j = 0; j < 4; ++j) {
  3283. get_int_from_table_16(q4[j], values, v1, v2);
  3284. sumi1 = __dp4a(v1, q8[j+0], sumi1);
  3285. sumi2 = __dp4a(v2, q8[j+4], sumi2);
  3286. }
  3287. return d * (sumi1 + sumi2);
#else
// only reached when compiling for __CUDA_ARCH__ < 610, where these kernels are not expected to
// be launched; return 0 so the function still has a defined result
return 0.0f;
#endif
  3289. }
  3290. template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
  3291. static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, const int ncols, const int nrows) {
  3292. const int row = blockIdx.x*blockDim.y + threadIdx.y;
  3293. if (row >= nrows) {
  3294. return;
  3295. }
  3296. const int blocks_per_row = ncols / qk;
  3297. const int blocks_per_warp = vdr * WARP_SIZE / qi;
  3298. // partial sum for each thread
  3299. float tmp = 0.0f;
  3300. const block_q_t * x = (const block_q_t *) vx;
  3301. const block_q8_1 * y = (const block_q8_1 *) vy;
  3302. for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
  3303. const int ibx = row*blocks_per_row + i; // x block index
  3304. const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
  3305. const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
  3306. tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
  3307. }
  3308. // sum up partial sums and write back result
  3309. #pragma unroll
  3310. for (int mask = 16; mask > 0; mask >>= 1) {
  3311. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  3312. }
  3313. if (threadIdx.x == 0) {
  3314. dst[row] = __float2half(tmp);
  3315. }
  3316. }
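// mul_mat_vec_q assigns one warp (the threadIdx.x dimension) per output row and GGML_CUDA_MMV_Y
// rows per block (the threadIdx.y dimension); each thread accumulates a strided subset of the
// row's blocks and the partial sums are combined with a butterfly __shfl_xor_sync reduction
// before lane 0 writes the half result. The *_cuda launchers below differ only in their template
// parameters. As a rough worked example (hypothetical sizes, GGML_CUDA_MMV_Y == 1): a q4_0
// matrix with ncols = 4096 and nrows = 4096 is launched with block_nums = (4096, 1, 1) and
// block_dims = (WARP_SIZE, 1, 1), and each row is covered by ncols/QK4_0 = 128 q4_0 blocks.
//
// Minimal host-side usage sketch, assuming d_x (q4_0 blocks), d_y (q8_1 blocks) and d_dst (half)
// are hypothetical device buffers that were quantized and allocated elsewhere:
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   mul_mat_vec_q4_0_q8_1_cuda(d_x, d_y, d_dst, /*ncols=*/4096, /*nrows=*/4096, stream);
//   cudaStreamSynchronize(stream);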
  3317. static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3318. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3319. const dim3 block_nums(block_num_y, 1, 1);
  3320. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3321. mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
  3322. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3323. }
  3324. static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3325. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3326. const dim3 block_nums(block_num_y, 1, 1);
  3327. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
  3329. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3330. }
  3331. static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3332. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3333. const dim3 block_nums(block_num_y, 1, 1);
  3334. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3335. mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
  3336. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3337. }
  3338. static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3339. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3340. const dim3 block_nums(block_num_y, 1, 1);
  3341. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3342. mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
  3343. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3344. }
  3345. static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3346. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3347. const dim3 block_nums(block_num_y, 1, 1);
  3348. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3349. mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
  3350. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3351. }
  3352. static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3353. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3354. const dim3 block_nums(block_num_y, 1, 1);
  3355. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3356. mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
  3357. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3358. }
  3359. static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3360. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3361. const dim3 block_nums(block_num_y, 1, 1);
  3362. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3363. mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
  3364. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3365. }
  3366. static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3367. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3368. const dim3 block_nums(block_num_y, 1, 1);
  3369. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3370. mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
  3371. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3372. }
  3373. static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3374. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3375. const dim3 block_nums(block_num_y, 1, 1);
  3376. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3377. mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
  3378. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3379. }
  3380. static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3381. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3382. const dim3 block_nums(block_num_y, 1, 1);
  3383. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3384. mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
  3385. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3386. }
  3387. static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3388. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3389. const dim3 block_nums(block_num_y, 1, 1);
  3390. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3391. mul_mat_vec_q<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
  3392. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3393. }
  3394. static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3395. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3396. const dim3 block_nums(block_num_y, 1, 1);
  3397. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3398. mul_mat_vec_q<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
  3399. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3400. }
  3401. static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3402. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3403. const dim3 block_nums(block_num_y, 1, 1);
  3404. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3405. mul_mat_vec_q<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
  3406. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3407. }
  3408. static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3409. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3410. const dim3 block_nums(block_num_y, 1, 1);
  3411. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3412. mul_mat_vec_q<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
  3413. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3414. }
  3415. static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3416. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3417. const dim3 block_nums(block_num_y, 1, 1);
  3418. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3419. mul_mat_vec_q<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
  3420. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3421. }
  3422. static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3423. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3424. const dim3 block_nums(block_num_y, 1, 1);
  3425. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3426. mul_mat_vec_q<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
  3427. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3428. }
  3429. static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3430. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3431. const dim3 block_nums(block_num_y, 1, 1);
  3432. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3433. mul_mat_vec_q<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
  3434. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3435. }
  3436. static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3437. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3438. const dim3 block_nums(block_num_y, 1, 1);
  3439. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3440. mul_mat_vec_q<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
  3441. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3442. }
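// mul_mat_q is the shared body behind all mul_mat_q<type> kernels: each thread block computes an
// mmq_y x mmq_x tile of the output. Per outer iteration it stages one strip of x blocks into the
// type-specific shared tiles (allocate_tiles/load_tiles), stages the matching q8_1 strip of y
// into tile_y_qs/tile_y_ds, and accumulates mmq_y/WARP_SIZE x mmq_x/nwarps partial results per
// thread with the type-specific vec_dot before writing the half outputs with bounds checks.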
  3443. template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
  3444. allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
  3445. static __device__ __forceinline__ void mul_mat_q(
  3446. const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
  3447. const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
  3448. const block_q_t * x = (const block_q_t *) vx;
  3449. const block_q8_1 * y = (const block_q8_1 *) vy;
  3450. const int blocks_per_row_x = ncols_x / qk;
  3451. const int blocks_per_col_y = nrows_y / QK8_1;
  3452. const int blocks_per_warp = WARP_SIZE / qi;
  3453. const int & ncols_dst = ncols_y;
  3454. const int row_dst_0 = blockIdx.x*mmq_y;
  3455. const int & row_x_0 = row_dst_0;
  3456. const int col_dst_0 = blockIdx.y*mmq_x;
  3457. const int & col_y_0 = col_dst_0;
  3458. int * tile_x_ql = nullptr;
  3459. half2 * tile_x_dm = nullptr;
  3460. int * tile_x_qh = nullptr;
  3461. int * tile_x_sc = nullptr;
  3462. allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
  3463. __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
  3464. __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
  3465. float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
  3466. for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
  3467. load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
  3468. threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
  3469. #pragma unroll
  3470. for (int ir = 0; ir < qr; ++ir) {
  3471. const int kqs = ir*WARP_SIZE + threadIdx.x;
  3472. const int kbxd = kqs / QI8_1;
  3473. #pragma unroll
  3474. for (int i = 0; i < mmq_x; i += nwarps) {
  3475. const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
  3476. const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
  3477. const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
  3478. tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
  3479. }
  3480. #pragma unroll
  3481. for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
  3482. const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
  3483. const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
  3484. const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
  3485. // if the sum is not needed it's faster to transform the scale to f32 ahead of time
  3486. const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
  3487. half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
  3488. if (need_sum) {
  3489. *dsi_dst = *dsi_src;
  3490. } else {
  3491. float * dfi_dst = (float *) dsi_dst;
  3492. *dfi_dst = __low2float(*dsi_src);
  3493. }
  3494. }
  3495. __syncthreads();
  3496. // #pragma unroll // unrolling this loop causes too much register pressure
  3497. for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
  3498. #pragma unroll
  3499. for (int j = 0; j < mmq_x; j += nwarps) {
  3500. #pragma unroll
  3501. for (int i = 0; i < mmq_y; i += WARP_SIZE) {
  3502. sum[i/WARP_SIZE][j/nwarps] += vec_dot(
  3503. tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
  3504. threadIdx.x + i, threadIdx.y + j, k);
  3505. }
  3506. }
  3507. }
  3508. __syncthreads();
  3509. }
  3510. }
  3511. #pragma unroll
  3512. for (int j = 0; j < mmq_x; j += nwarps) {
  3513. const int col_dst = col_dst_0 + j + threadIdx.y;
  3514. if (col_dst >= ncols_dst) {
  3515. return;
  3516. }
  3517. #pragma unroll
  3518. for (int i = 0; i < mmq_y; i += WARP_SIZE) {
  3519. const int row_dst = row_dst_0 + threadIdx.x + i;
  3520. if (row_dst >= nrows_dst) {
  3521. continue;
  3522. }
  3523. dst[col_dst*nrows_dst + row_dst] = __float2half(sum[i/WARP_SIZE][j/nwarps]);
  3524. }
  3525. }
  3526. }
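// Tile shapes for the q4_0 MMQ kernel: the ROCm build uses larger tiles (MMQ_X 64, MMQ_Y 128,
// 8 warps) than the CUDA build (MMQ_X 4, MMQ_Y 32, 4 warps). The same pattern of per-type
// MMQ_X/MMQ_Y/NWARPS macros, a thin __global__ wrapper and a ggml_mul_mat_*_cuda launcher that
// picks need_check from nrows_x % mmq_y is repeated for every quantization type below.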
#if defined(USE_ROCM)
#define MMQ_X_Q4_0 64
#define MMQ_Y_Q4_0 128
#define NWARPS_Q4_0 8
#else
#define MMQ_X_Q4_0 4
#define MMQ_Y_Q4_0 32
#define NWARPS_Q4_0 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q4_0, 2)
#endif
mul_mat_q4_0(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q4_0;
    const int mmq_y = MMQ_Y_Q4_0;
    const int nwarps = NWARPS_Q4_0;

    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q4_0_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int mmq_x = MMQ_X_Q4_0;
    int mmq_y = MMQ_Y_Q4_0;
    int nwarps = NWARPS_Q4_0;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}
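
// Grid sizing, illustrative numbers only: with the non-ROCm tile sizes above
// (MMQ_Y_Q4_0 = 32 rows, MMQ_X_Q4_0 = 4 columns), a 4096-row weight and an 8-column
// activation launch block_num_x = (4096 + 31) / 32 = 128 and
// block_num_y = (8 + 3) / 4 = 2 blocks, and need_check stays false because
// 4096 % 32 == 0. The nine launchers below follow the same pattern with their own
// per-type tile sizes.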
#if defined(USE_ROCM)
#define MMQ_X_Q4_1 64
#define MMQ_Y_Q4_1 128
#define NWARPS_Q4_1 8
#else
#define MMQ_X_Q4_1 4
#define MMQ_Y_Q4_1 32
#define NWARPS_Q4_1 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1, 2)
#endif
mul_mat_q4_1(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q4_1;
    const int mmq_y = MMQ_Y_Q4_1;
    const int nwarps = NWARPS_Q4_1;

    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q4_1_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int mmq_x = MMQ_X_Q4_1;
    int mmq_y = MMQ_Y_Q4_1;
    int nwarps = NWARPS_Q4_1;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q5_0 64
#define MMQ_Y_Q5_0 128
#define NWARPS_Q5_0 8
#else
#define MMQ_X_Q5_0 4
#define MMQ_Y_Q5_0 32
#define NWARPS_Q5_0 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q5_0, 2)
#endif
mul_mat_q5_0(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q5_0;
    const int mmq_y = MMQ_Y_Q5_0;
    const int nwarps = NWARPS_Q5_0;

    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q5_0_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q5_0;
    const int mmq_y = MMQ_Y_Q5_0;
    const int nwarps = NWARPS_Q5_0;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q5_1 64
#define MMQ_Y_Q5_1 128
#define NWARPS_Q5_1 8
#else
#define MMQ_X_Q5_1 4
#define MMQ_Y_Q5_1 32
#define NWARPS_Q5_1 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q5_1, 2)
#endif
mul_mat_q5_1(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q5_1;
    const int mmq_y = MMQ_Y_Q5_1;
    const int nwarps = NWARPS_Q5_1;

    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q5_1_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q5_1;
    const int mmq_y = MMQ_Y_Q5_1;
    const int nwarps = NWARPS_Q5_1;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q8_0 64
#define MMQ_Y_Q8_0 128
#define NWARPS_Q8_0 8
#else
#define MMQ_X_Q8_0 4
#define MMQ_Y_Q8_0 32
#define NWARPS_Q8_0 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q8_0, 2)
#endif
mul_mat_q8_0(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q8_0;
    const int mmq_y = MMQ_Y_Q8_0;
    const int nwarps = NWARPS_Q8_0;

    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q8_0_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q8_0;
    const int mmq_y = MMQ_Y_Q8_0;
    const int nwarps = NWARPS_Q8_0;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q2_K 64
#define MMQ_Y_Q2_K 128
#define NWARPS_Q2_K 8
#else
#define MMQ_X_Q2_K 4
#define MMQ_Y_Q2_K 32
#define NWARPS_Q2_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q2_K, 2)
#endif
mul_mat_q2_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q2_K;
    const int mmq_y = MMQ_Y_Q2_K;
    const int nwarps = NWARPS_Q2_K;

    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q2_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q2_K;
    const int mmq_y = MMQ_Y_Q2_K;
    const int nwarps = NWARPS_Q2_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q3_K 64
#define MMQ_Y_Q3_K 128
#define NWARPS_Q3_K 8
#else
#define MMQ_X_Q3_K 4
#define MMQ_Y_Q3_K 32
#define NWARPS_Q3_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K, 2)
#endif
mul_mat_q3_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q3_K;
    const int mmq_y = MMQ_Y_Q3_K;
    const int nwarps = NWARPS_Q3_K;

    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q3_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q3_K;
    const int mmq_y = MMQ_Y_Q3_K;
    const int nwarps = NWARPS_Q3_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q4_K 64
#define MMQ_Y_Q4_K 128
#define NWARPS_Q4_K 8
#else
#define MMQ_X_Q4_K 4
#define MMQ_Y_Q4_K 32
#define NWARPS_Q4_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K, 2)
#endif
mul_mat_q4_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q4_K;
    const int mmq_y = MMQ_Y_Q4_K;
    const int nwarps = NWARPS_Q4_K;

    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q4_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q4_K;
    const int mmq_y = MMQ_Y_Q4_K;
    const int nwarps = NWARPS_Q4_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q5_K 64
#define MMQ_Y_Q5_K 128
#define NWARPS_Q5_K 8
#else
#define MMQ_X_Q5_K 4
#define MMQ_Y_Q5_K 32
#define NWARPS_Q5_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q5_K, 2)
#endif
mul_mat_q5_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q5_K;
    const int mmq_y = MMQ_Y_Q5_K;
    const int nwarps = NWARPS_Q5_K;

    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q5_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q5_K;
    const int mmq_y = MMQ_Y_Q5_K;
    const int nwarps = NWARPS_Q5_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q6_K 64
#define MMQ_Y_Q6_K 128
#define NWARPS_Q6_K 8
#else
#define MMQ_X_Q6_K 4
#define MMQ_Y_Q6_K 32
#define NWARPS_Q6_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K, 2)
#endif
mul_mat_q6_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q6_K;
    const int mmq_y = MMQ_Y_Q6_K;
    const int nwarps = NWARPS_Q6_K;

    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q6_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q6_K;
    const int mmq_y = MMQ_Y_Q6_K;
    const int nwarps = NWARPS_Q6_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}
torch::Tensor ggml_dequantize(
    torch::Tensor W,  // quant weight
    int8_t type,
    int64_t m,
    int64_t n
){
    const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
    auto options = torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
    at::Tensor DW = torch::empty({m, n}, options);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type);
    to_fp16_cuda(
        (void*)W.data_ptr(), (half*)DW.data_ptr(), m * n, stream
    );
    return DW;
}
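
// Illustrative host-side call, shapes assumed for the example (type 2 selects Q4_0;
// see the type table in the comment below):
//   torch::Tensor DW = ggml_dequantize(W, /*type=*/2, /*m=*/4096, /*n=*/4096);
//   // DW is a {4096, 4096} fp16 tensor on W's device.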
// Newer quantization types don't implement ggml_mul_mat_vec; they only use ggml_mul_mat_vec_a8.
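// The `type` argument here and below follows the ggml quantization enumeration, as
// reflected in the switch statements: 2 = Q4_0, 3 = Q4_1, 6 = Q5_0, 7 = Q5_1,
// 8 = Q8_0, 10 = Q2_K, 11 = Q3_K, 12 = Q4_K, 13 = Q5_K, 14 = Q6_K, 16 = IQ2_XXS,
// 17 = IQ2_XS, 18 = IQ3_XXS, 19 = IQ1_S, 20 = IQ4_NL, 21 = IQ3_S, 22 = IQ2_S,
// 23 = IQ4_XS.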
torch::Tensor ggml_mul_mat_vec(
    torch::Tensor W,  // quant weight
    torch::Tensor X,  // input
    int8_t type,
    int64_t row
){
    size_t col = X.sizes()[1];
    const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
    auto options = torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
    at::Tensor Y = torch::empty({1, row}, options);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
    switch (type) {
        case 2:
            dequantize_mul_mat_vec_q4_0_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 3:
            dequantize_mul_mat_vec_q4_1_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 6:
            dequantize_mul_mat_vec_q5_0_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 7:
            dequantize_mul_mat_vec_q5_1_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 8:
            dequantize_mul_mat_vec_q8_0_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 10:
            dequantize_mul_mat_vec_q2_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 11:
            dequantize_mul_mat_vec_q3_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 12:
            dequantize_mul_mat_vec_q4_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 13:
            dequantize_mul_mat_vec_q5_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 14:
            dequantize_mul_mat_vec_q6_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 16:
            dequantize_mul_mat_vec_iq2_xxs_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 17:
            dequantize_mul_mat_vec_iq2_xs_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
    }
    return Y;
}
torch::Tensor ggml_mul_mat_vec_a8(
    torch::Tensor W,  // quant weight
    torch::Tensor X,  // input
    int8_t type,
    int64_t row
){
    int col = X.sizes()[1];
    const int padded = (col + 512 - 1) / 512 * 512;
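    // Sizing note: the column count is rounded up to a multiple of 512 so the q8_1
    // activation buffer covers whole blocks. Each group of QK8_1 = 32 values becomes
    // one block_q8_1 holding 32 int8 quants plus a half2 (d, s), i.e. 36 bytes =
    // 9 int32 elements -- hence the `padded / 32 * 9` allocation of quant_X below.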
    const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
    auto options = torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
    at::Tensor Y = torch::empty({1, row}, options);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
    options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
    at::Tensor quant_X = torch::empty({1, padded / 32 * 9}, options);
    quantize_row_q8_1_cuda((half*)X.data_ptr(), (void*)quant_X.data_ptr(), col, 1, stream);
    switch (type) {
        case 2:
            mul_mat_vec_q4_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 3:
            mul_mat_vec_q4_1_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 6:
            mul_mat_vec_q5_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 7:
            mul_mat_vec_q5_1_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 8:
            mul_mat_vec_q8_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 10:
            mul_mat_vec_q2_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 11:
            mul_mat_vec_q3_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 12:
            mul_mat_vec_q4_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 13:
            mul_mat_vec_q5_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 14:
            mul_mat_vec_q6_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 16:
            mul_mat_vec_iq2_xxs_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 17:
            mul_mat_vec_iq2_xs_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 18:
            mul_mat_vec_iq3_xxs_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 19:
            mul_mat_vec_iq1_s_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 20:
            mul_mat_vec_iq4_nl_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 21:
            mul_mat_vec_iq3_s_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 22:
            mul_mat_vec_iq2_s_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 23:
            mul_mat_vec_iq4_xs_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
    }
    return Y;
}
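
// ggml_mul_mat_vec_a8 above is the single-row activation path (Y is {1, row} and the
// activation is quantized as a single q8_1 row); ggml_mul_mat_a8 below handles a
// whole batch of activation rows using the tiled mul_mat_q launchers.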
torch::Tensor ggml_mul_mat_a8(
    torch::Tensor W,  // quant weight
    torch::Tensor X,  // input
    int8_t type,
    int64_t row
) {
    int col = X.sizes()[1];
    int padded = (col + 512 - 1) / 512 * 512;
    int batch = X.sizes()[0];
    const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
    auto options = torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
    at::Tensor Y = torch::empty({batch, row}, options);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
    options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
    at::Tensor quant_X = torch::empty({batch, padded / 32 * 9}, options);
    quantize_row_q8_1_cuda((half*)X.data_ptr(), (void*)quant_X.data_ptr(), col, batch, stream);
    switch (type) {
        case 2:
            ggml_mul_mat_q4_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 3:
            ggml_mul_mat_q4_1_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 6:
            ggml_mul_mat_q5_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 7:
            ggml_mul_mat_q5_1_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 8:
            ggml_mul_mat_q8_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 10:
            ggml_mul_mat_q2_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 11:
            ggml_mul_mat_q3_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 12:
            ggml_mul_mat_q4_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 13:
            ggml_mul_mat_q5_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 14:
            ggml_mul_mat_q6_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
    }
    return Y;
}
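
// Illustrative host-side usage (shapes and type code assumed for the example):
//   torch::Tensor X = torch::randn({8, 4096},
//       torch::dtype(torch::kFloat16).device(torch::kCUDA));
//   torch::Tensor Y = ggml_mul_mat_a8(W, X, /*type=*/2, /*row=*/11008);
//   // Y has shape {8, 11008}: one fp16 output row per input row of X.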