// gguf_kernel.cu

#include <cuda_fp16.h>
#include <cuda_runtime.h>

#include <torch/all.h>
#include <torch/python.h>

#include <c10/cuda/CUDAGuard.h>

#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
#define WARP_SIZE 32
#define K_SCALE_SIZE 12
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define GGML_CUDA_DMMV_X 32
#define GGML_CUDA_MMV_Y 1

// Data Structures
// QK = number of values after dequantization
// QR = QK / number of values before dequantization
// QI = number of 32 bit integers before dequantization

#define QK4_0 32
#define QR4_0 2
#define QI4_0 (QK4_0 / (4 * QR4_0))
typedef struct {
    half d;                // delta
    uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
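
// Illustrative sketch (not part of the original kernel): how a single block_q4_0
// expands back into QK4_0 floats. Each byte of qs packs two 4-bit quants; the low
// nibble maps to position i and the high nibble to position i + QK4_0/2, and both
// are offset by -8 before scaling by the fp16 delta d. The helper name
// dequant_q4_0_sketch is hypothetical and exists only to document the layout.
static __device__ __forceinline__ void dequant_q4_0_sketch(const block_q4_0* b, float* y) {
    const float d = __half2float(b->d);
    for (int i = 0; i < QK4_0 / 2; ++i) {
        const int lo = (b->qs[i] & 0x0F) - 8;  // low nibble -> first half of the block
        const int hi = (b->qs[i] >> 4)   - 8;  // high nibble -> second half of the block
        y[i]             = d * lo;
        y[i + QK4_0 / 2] = d * hi;
    }
}
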
#define QK4_1 32
#define QR4_1 2
#define QI4_1 (QK4_1 / (4 * QR4_1))
typedef struct {
    half2 dm;              // dm.x = delta, dm.y = min
    uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
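
// Illustrative sketch (not part of the original kernel): block_q4_1 stores an
// unsigned 4-bit quant plus a per-block minimum, so a value is reconstructed as
// y = d * q + m with d = dm.x and m = dm.y. The helper name is hypothetical.
static __device__ __forceinline__ void dequant_q4_1_sketch(const block_q4_1* b, float* y) {
    const float d = __low2float(b->dm);   // delta
    const float m = __high2float(b->dm);  // min
    for (int i = 0; i < QK4_1 / 2; ++i) {
        y[i]             = d * (b->qs[i] & 0x0F) + m;
        y[i + QK4_1 / 2] = d * (b->qs[i] >> 4)   + m;
    }
}
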
#define QK5_0 32
#define QR5_0 2
#define QI5_0 (QK5_0 / (4 * QR5_0))
typedef struct {
    half d;                // delta
    uint8_t qh[4];         // 5th bit of quants
    uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
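
// Illustrative sketch (not part of the original kernel): block_q5_0 splits each
// 5-bit quant into a 4-bit nibble in qs plus one high bit packed into the 32-bit
// field qh; the recombined value is offset by -16 and scaled by d. The bit
// positions below follow the usual ggml q5_0 layout (bit i for element i, bit
// i + 16 for element i + 16); the helper name is hypothetical.
static __device__ __forceinline__ void dequant_q5_0_sketch(const block_q5_0* b, float* y) {
    const float d = __half2float(b->d);
    // assemble the 32 high bits, little-endian byte order
    const uint32_t qh = b->qh[0] | (b->qh[1] << 8) | (b->qh[2] << 16) | ((uint32_t)b->qh[3] << 24);
    for (int i = 0; i < QK5_0 / 2; ++i) {
        const int xh0 = ((qh >> i) << 4) & 0x10;   // high bit of element i
        const int xh1 = (qh >> (i + 12)) & 0x10;   // high bit of element i + 16
        y[i]             = d * (((b->qs[i] & 0x0F) | xh0) - 16);
        y[i + QK5_0 / 2] = d * (((b->qs[i] >>   4) | xh1) - 16);
    }
}
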
#define QK5_1 32
#define QR5_1 2
#define QI5_1 (QK5_1 / (4 * QR5_1))
typedef struct {
    half2 dm;              // dm.x = delta, dm.y = min
    uint8_t qh[4];         // 5th bit of quants
    uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;

#define QK8_0 32
#define QR8_0 1
#define QI8_0 (QK8_0 / (4 * QR8_0))
typedef struct {
    half d;           // delta
    int8_t qs[QK8_0]; // quants
} block_q8_0;
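
// Illustrative sketch (not part of the original kernel): q8_0 stores plain signed
// 8-bit quants, so each value is simply qs[i] scaled by the fp16 delta d. The
// helper name is hypothetical.
static __device__ __forceinline__ void dequant_q8_0_sketch(const block_q8_0* b, float* y) {
    const float d = __half2float(b->d);
    for (int i = 0; i < QK8_0; ++i) {
        y[i] = d * b->qs[i];
    }
}
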
#define QK8_1 32
#define QR8_1 1
#define QI8_1 (QK8_1 / (4 * QR8_1))
typedef struct {
    half2 ds;         // ds.x = delta, ds.y = sum
    int8_t qs[QK8_1]; // quants
} block_q8_1;

#define QR2_K 4
#define QI2_K (QK_K / (4*QR2_K))
typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
    half2 dm;                // super-block scale for quantized scales/mins
} block_q2_K;
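
// Illustrative sketch (not part of the original kernel): in a q2_K super-block,
// each byte of scales packs a 4-bit scale (low nibble) and a 4-bit min (high
// nibble) for one 16-value sub-block; dm.x rescales the scales and dm.y the mins.
// A single 2-bit quant q taken from sub-block `is` then dequantizes as shown
// below. The interleaving of the 2-bit quants inside qs is not reproduced here;
// the helper name is hypothetical.
static __device__ __forceinline__ float dequant_one_q2_K_sketch(
        const block_q2_K* b, int is /* sub-block 0..15 */, int q /* 2-bit quant 0..3 */) {
    const float d    = __low2float(b->dm);   // super-block scale for the scales
    const float dmin = __high2float(b->dm);  // super-block scale for the mins
    const float sc   = (float)(b->scales[is] & 0x0F);
    const float mn   = (float)(b->scales[is] >> 4);
    return d * sc * (float)q - dmin * mn;
}
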
#define QR3_K 4
#define QI3_K (QK_K / (4*QR3_K))
typedef struct {
    uint8_t hmask[QK_K/8];        // quants - high bit
    uint8_t qs[QK_K/4];           // quants - low 2 bits
    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
    half d;                       // super-block scale
} block_q3_K;

#define QR4_K 2
#define QI4_K (QK_K / (4*QR4_K))
typedef struct {
    half2 dm;                  // super-block scale for quantized scales/mins
    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];        // 4-bit quants
} block_q4_K;
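
// Illustrative sketch (not part of the original kernel): q4_K (and q5_K below)
// pack eight 6-bit scales and eight 6-bit mins into the 12-byte scales array.
// The unpacking below mirrors the layout commonly used by the ggml K-quants:
// the first four scale/min pairs live in the low 6 bits of bytes 0-7, while the
// last four are split between the low/high nibbles of bytes 8-11 and the top
// 2 bits of bytes 0-7. A 4-bit quant q then dequantizes as
// dm.x * sc * q - dm.y * mn. The helper name is hypothetical.
static __device__ __forceinline__ void unpack_scale_min_k4_sketch(
        int j /* sub-block 0..7 */, const uint8_t* q, uint8_t* sc, uint8_t* mn) {
    if (j < 4) {
        *sc = q[j] & 63;
        *mn = q[j + 4] & 63;
    } else {
        *sc = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);
        *mn = (q[j + 4] >>   4) | ((q[j]     >> 6) << 4);
    }
}
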
#define QR5_K 2
#define QI5_K (QK_K / (4*QR5_K))
typedef struct {
    half2 dm;                     // super-block scale for quantized scales/mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8];           // quants, high bit
    uint8_t qs[QK_K/2];           // quants, low 4 bits
} block_q5_K;

#define QR6_K 2
#define QI6_K (QK_K / (4*QR6_K))
typedef struct {
    uint8_t ql[QK_K/2];     // quants, lower 4 bits
    uint8_t qh[QK_K/4];     // quants, upper 2 bits
    int8_t scales[QK_K/16]; // scales
    half d;                 // delta
} block_q6_K;
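
// Illustrative sketch (not part of the original kernel): q6_K rebuilds each 6-bit
// quant from a low nibble stored in ql and two high bits stored in qh, recenters
// it by -32, and scales it by the fp16 super-block delta d times the signed 8-bit
// scale of its 16-value sub-block. The code shows only the bit math for a single
// value and does not reproduce the exact interleaving used by the kernels; the
// helper name is hypothetical.
static __device__ __forceinline__ float dequant_one_q6_K_sketch(
        const block_q6_K* b, int is /* sub-block 0..15 */,
        uint8_t low4 /* low nibble from ql */, uint8_t high2 /* 2 bits from qh */) {
    const float d = __half2float(b->d);
    const int   q = (int)(low4 | (high2 << 4)) - 32;  // signed 6-bit value in [-32, 31]
    return d * (float)b->scales[is] * (float)q;
}
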
#define QR2_XXS 8
#define QI2_XXS (QK_K / (4*QR2_XXS))
typedef struct {
    half d;
    uint16_t qs[QK_K/8];
} block_iq2_xxs;

#define QR2_XS 8
#define QI2_XS (QK_K / (4*QR2_XS))
typedef struct {
    half d;
    uint16_t qs[QK_K/8];
    uint8_t scales[QK_K/32];
} block_iq2_xs;

#define QR2_S 8
#define QI2_S (QK_K / (4*QR2_S))
typedef struct {
    half d;
    uint8_t qs[QK_K/4];
    uint8_t qh[QK_K/32];
    uint8_t scales[QK_K/32];
} block_iq2_s;

#define QR3_XXS 8
#define QI3_XXS (QK_K / (4*QR3_XXS))
typedef struct {
    half d;
    uint8_t qs[3*(QK_K/8)];
} block_iq3_xxs;

#define QR3_XS 8
#define QI3_XS (QK_K / (4*QR3_XS))
#define IQ3S_N_SCALE QK_K/64
typedef struct {
    half d;
    uint8_t qs[QK_K/4];
    uint8_t qh[QK_K/32];
    uint8_t signs[QK_K/8];
    uint8_t scales[IQ3S_N_SCALE];
} block_iq3_s;

#define QR1_S 8
#define QI1_S (QK_K / (4*QR1_S))
typedef struct {
    half d;
    uint8_t qs[QK_K/8];
    uint8_t scales[QK_K/16];
} block_iq1_s;

#define QK4_NL 32
#define QR4_NL 2
#define QI4_NL (QK4_NL / (4*QR4_NL))
typedef struct {
    half d;
    uint8_t qs[QK4_NL/2];
} block_iq4_nl;

#define QR4_XS 8
#define QI4_XS (QK_K / (4*QR4_XS))
typedef struct {
    half d;
    uint16_t scales_h;
    uint8_t scales_l[QK_K/64];
    uint8_t qs[QK_K/2];
} block_iq4_xs;
  159. static const __device__ uint64_t iq2xxs_grid[256] = {
  160. 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
  161. 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
  162. 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
  163. 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
  164. 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
  165. 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
  166. 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
  167. 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
  168. 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
  169. 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
  170. 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
  171. 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
  172. 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
  173. 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
  174. 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
  175. 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
  176. 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
  177. 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
  178. 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
  179. 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
  180. 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
  181. 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
  182. 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
  183. 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
  184. 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
  185. 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
  186. 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
  187. 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
  188. 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
  189. 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
  190. 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
  191. 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
  192. 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
  193. 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
  194. 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
  195. 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
  196. 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
  197. 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
  198. 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
  199. 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
  200. 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
  201. 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
  202. 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
  203. 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
  204. 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
  205. 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
  206. 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
  207. 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
  208. 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
  209. 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
  210. 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
  211. 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
  212. 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
  213. 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
  214. 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
  215. 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
  216. 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
  217. 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
  218. 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
  219. 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
  220. 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
  221. 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
  222. 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
  223. 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
  224. };
  225. static const __device__ uint64_t iq2xs_grid[512] = {
  226. 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
  227. 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
  228. 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
  229. 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
  230. 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
  231. 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
  232. 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
  233. 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
  234. 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
  235. 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
  236. 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
  237. 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
  238. 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
  239. 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
  240. 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
  241. 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
  242. 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
  243. 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
  244. 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
  245. 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
  246. 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
  247. 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
  248. 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
  249. 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
  250. 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
  251. 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
  252. 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
  253. 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
  254. 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
  255. 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
  256. 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
  257. 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
  258. 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
  259. 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
  260. 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
  261. 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
  262. 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
  263. 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
  264. 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
  265. 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
  266. 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
  267. 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
  268. 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
  269. 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
  270. 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
  271. 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
  272. 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
  273. 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
  274. 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
  275. 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
  276. 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
  277. 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
  278. 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
  279. 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
  280. 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
  281. 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
  282. 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
  283. 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
  284. 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
  285. 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
  286. 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
  287. 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
  288. 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
  289. 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
  290. 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
  291. 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
  292. 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
  293. 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
  294. 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
  295. 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
  296. 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
  297. 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
  298. 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
  299. 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
  300. 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
  301. 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
  302. 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
  303. 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
  304. 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
  305. 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
  306. 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
  307. 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
  308. 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
  309. 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
  310. 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
  311. 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
  312. 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
  313. 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
  314. 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
  315. 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
  316. 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
  317. 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
  318. 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
  319. 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
  320. 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
  321. 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
  322. 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
  323. 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
  324. 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
  325. 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
  326. 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
  327. 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
  328. 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
  329. 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
  330. 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
  331. 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
  332. 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
  333. 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
  334. 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
  335. 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
  336. 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
  337. 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
  338. 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
  339. 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
  340. 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
  341. 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
  342. 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
  343. 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
  344. 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
  345. 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
  346. 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
  347. 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
  348. 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
  349. 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
  350. 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
  351. 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
  352. 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
  353. 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
  354. };
  355. static const __device__ uint64_t iq2s_grid[1024] = {
  356. 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
  357. 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
  358. 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
  359. 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
  360. 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
  361. 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
  362. 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
  363. 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
  364. 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
  365. 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
  366. 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
  367. 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
  368. 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
  369. 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
  370. 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
  371. 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
  372. 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
  373. 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
  374. 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
  375. 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
  376. 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
  377. 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
  378. 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
  379. 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
  380. 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
  381. 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
  382. 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
  383. 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
  384. 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
  385. 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
  386. 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
  387. 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
  388. 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
  389. 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
  390. 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
  391. 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
  392. 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
  393. 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
  394. 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
  395. 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
  396. 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
  397. 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
  398. 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
  399. 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
  400. 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
  401. 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
  402. 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
  403. 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
  404. 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
  405. 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
  406. 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
  407. 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
  408. 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
  409. 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
  410. 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
  411. 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
  412. 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
  413. 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
  414. 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
  415. 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
  416. 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
  417. 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
  418. 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
  419. 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
  420. 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
  421. 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
  422. 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
  423. 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
  424. 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
  425. 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
  426. 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
  427. 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
  428. 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
  429. 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
  430. 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
  431. 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
  432. 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
  433. 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
  434. 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
  435. 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
  436. 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
  437. 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
  438. 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
  439. 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
  440. 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
  441. 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
  442. 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
  443. 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
  444. 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
  445. 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
  446. 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
  447. 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
  448. 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
  449. 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
  450. 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
  451. 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
  452. 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
  453. 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
  454. 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
  455. 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
  456. 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
  457. 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
  458. 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
  459. 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
  460. 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
  461. 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
  462. 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
  463. 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
  464. 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
  465. 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
  466. 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
  467. 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
  468. 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
  469. 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
  470. 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
  471. 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
  472. 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
  473. 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
  474. 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
  475. 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
  476. 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
  477. 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
  478. 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
  479. 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
  480. 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
  481. 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
  482. 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
  483. 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
  484. 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
  485. 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
  486. 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
  487. 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
  488. 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
  489. 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
  490. 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
  491. 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
  492. 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
  493. 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
  494. 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
  495. 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
  496. 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
  497. 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
  498. 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
  499. 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
  500. 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
  501. 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
  502. 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
  503. 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
  504. 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
  505. 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
  506. 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
  507. 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
  508. 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
  509. 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
  510. 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
  511. 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
  512. 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
  513. 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
  514. 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
  515. 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
  516. 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
  517. 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
  518. 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
  519. 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
  520. 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
  521. 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
  522. 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
  523. 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
  524. 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
  525. 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
  526. 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
  527. 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
  528. 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
  529. 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
  530. 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
  531. 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
  532. 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
  533. 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
  534. 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
  535. 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
  536. 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
  537. 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
  538. 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
  539. 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
  540. 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
  541. 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
  542. 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
  543. 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
  544. 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
  545. 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
  546. 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
  547. 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
  548. 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
  549. 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
  550. 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
  551. 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
  552. 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
  553. 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
  554. 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
  555. 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
  556. 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
  557. 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
  558. 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
  559. 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
  560. 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
  561. 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
  562. 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
  563. 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
  564. 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
  565. 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
  566. 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
  567. 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
  568. 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
  569. 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
  570. 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
  571. 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
  572. 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
  573. 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
  574. 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
  575. 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
  576. 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
  577. 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
  578. 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
  579. 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
  580. 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
  581. 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
  582. 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
  583. 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
  584. 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
  585. 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
  586. 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
  587. 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
  588. 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
  589. 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
  590. 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
  591. 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
  592. 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
  593. 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
  594. 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
  595. 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
  596. 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
  597. 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
  598. 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
  599. 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
  600. 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
  601. 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
  602. 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
  603. 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
  604. 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
  605. 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
  606. 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
  607. 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
  608. 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
  609. 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
  610. 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
  611. 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
  612. };
  613. static const __device__ uint32_t iq3xxs_grid[256] = {
  614. 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
  615. 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
  616. 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
  617. 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
  618. 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
  619. 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
  620. 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
  621. 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
  622. 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
  623. 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
  624. 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
  625. 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
  626. 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
  627. 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
  628. 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
  629. 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
  630. 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
  631. 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
  632. 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
  633. 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
  634. 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
  635. 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
  636. 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
  637. 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
  638. 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
  639. 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
  640. 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
  641. 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
  642. 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
  643. 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
  644. 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
  645. 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
  646. };
  647. static const __device__ uint32_t iq3xs_grid[512] = {
  648. 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
  649. 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
  650. 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
  651. 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
  652. 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
  653. 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
  654. 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
  655. 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
  656. 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
  657. 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
  658. 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
  659. 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
  660. 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
  661. 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
  662. 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
  663. 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
  664. 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
  665. 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
  666. 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
  667. 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
  668. 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
  669. 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
  670. 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
  671. 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
  672. 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
  673. 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
  674. 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
  675. 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
  676. 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
  677. 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
  678. 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
  679. 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
  680. 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
  681. 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
  682. 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
  683. 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
  684. 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
  685. 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
  686. 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
  687. 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
  688. 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
  689. 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
  690. 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
  691. 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
  692. 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
  693. 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
  694. 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
  695. 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
  696. 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
  697. 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
  698. 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
  699. 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
  700. 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
  701. 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
  702. 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
  703. 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
  704. 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
  705. 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
  706. 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
  707. 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
  708. 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
  709. 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
  710. 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
  711. 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
  712. };
  713. static const __device__ uint64_t iq1s_grid[512] = {
  714. 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
  715. 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
  716. 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
  717. 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
  718. 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
  719. 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
  720. 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
  721. 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
  722. 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
  723. 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
  724. 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
  725. 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
  726. 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
  727. 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
  728. 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
  729. 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
  730. 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
  731. 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
  732. 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
  733. 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
  734. 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
  735. 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
  736. 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
  737. 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
  738. 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
  739. 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
  740. 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
  741. 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
  742. 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
  743. 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
  744. 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
  745. 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
  746. 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
  747. 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
  748. 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
  749. 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
  750. 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
  751. 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
  752. 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
  753. 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
  754. 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
  755. 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
  756. 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
  757. 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
  758. 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
  759. 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
  760. 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
  761. 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
  762. 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
  763. 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
  764. 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
  765. 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
  766. 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
  767. 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
  768. 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
  769. 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
  770. 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
  771. 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
  772. 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
  773. 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
  774. 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
  775. 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
  776. 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
  777. 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
  778. 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
  779. 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
  780. 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
  781. 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
  782. 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
  783. 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
  784. 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
  785. 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
  786. 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
  787. 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
  788. 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
  789. 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
  790. 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
  791. 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
  792. 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
  793. 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
  794. 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
  795. 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
  796. 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
  797. 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
  798. 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
  799. 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
  800. 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
  801. 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
  802. 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
  803. 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
  804. 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
  805. 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
  806. 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
  807. 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
  808. 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
  809. 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
  810. 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
  811. 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
  812. 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
  813. 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
  814. 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
  815. 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
  816. 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
  817. 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
  818. 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
  819. 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
  820. 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
  821. 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
  822. 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
  823. 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
  824. 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
  825. 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
  826. 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
  827. 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
  828. 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
  829. 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
  830. 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
  831. 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
  832. 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
  833. 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
  834. 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
  835. 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
  836. 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
  837. 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
  838. 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
  839. 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
  840. 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
  841. 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
  842. };
  843. static const __device__ uint8_t ksigns_iq2xs[128] = {
  844. 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
  845. 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
  846. 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
  847. 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
  848. 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
  849. 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
  850. 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
  851. 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
  852. };
  853. static const __device__ uint64_t ksigns64[128] = {
  854. 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
  855. 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
  856. 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
  857. 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
  858. 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
  859. 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
  860. 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
  861. 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
  862. 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
  863. 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
  864. 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
  865. 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
  866. 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
  867. 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
  868. 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
  869. 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
  870. 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
  871. 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
  872. 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
  873. 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
  874. 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
  875. 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
  876. 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
  877. 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
  878. 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
  879. 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
  880. 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
  881. 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
  882. 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
  883. 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
  884. 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
  885. 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
  886. };
  887. static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  888. static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
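// The constant tables above are read-only lookup data for the quantized formats: the iq*_grid
// arrays are codebooks of magnitude patterns, ksigns_iq2xs/ksigns64 encode sign patterns,
// kmask_iq2xs selects individual sign bits, and kvalues_iq4nl holds the non-linear IQ4 levels.
// They are consumed by the IQ-series dequantization kernels below and, presumably, by the
// quantized dot-product routines later in the file.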
  889. typedef half dfloat; // dequantize float
  890. typedef half2 dfloat2;
  891. typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
  892. typedef void (*to_fp16_cuda_t)(const void * __restrict__ x, dfloat * __restrict__ y, int k, cudaStream_t stream);
  893. typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
  894. typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
  895. typedef void (*load_tiles_cuda_t)(
  896. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  897. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
  898. typedef float (*vec_dot_q_mul_mat_cuda_t)(
  899. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  900. const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
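// The typedefs above are the function-pointer hooks through which the templated kernels in this
// file are specialized per quantization format: dequantize_kernel_t feeds dequantize_block and
// dequantize_mul_mat_vec below, to_fp16_cuda_t is the signature returned by ggml_get_to_fp16_cuda,
// and the allocate_tiles/load_tiles/vec_dot_* types serve the tiled mat-mul path later in the file.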
901. // Utility functions
  902. #if defined(USE_ROCM)
  903. #ifndef __has_builtin
  904. #define __has_builtin(x) 0
  905. #endif
  906. typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
  907. static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
  908. const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
  909. const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
  910. #if __has_builtin(__builtin_elementwise_sub_sat)
  911. const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
  912. return reinterpret_cast<const int &>(c);
  913. #else
  914. int8x4_t c;
  915. int16_t tmp;
  916. #pragma unroll
  917. for (int i = 0; i < 4; i++) {
  918. tmp = va[i] - vb[i];
  919. if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
  920. if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
  921. c[i] = tmp;
  922. }
  923. return reinterpret_cast<int &>(c);
  924. #endif // __has_builtin(__builtin_elementwise_sub_sat)
  925. }
  926. static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
  927. #if __has_builtin(__builtin_amdgcn_sdot4)
  928. c = __builtin_amdgcn_sdot4(a, b, c, false);
  929. #else
  930. const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
  931. const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
  932. c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
  933. #endif
  934. return c;
  935. }
  936. #endif // defined(USE_ROCM)
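// The ROCm-only block above supplies device implementations of the CUDA integer intrinsics used
// by the quantized kernels: __vsubss4 (per-lane saturating subtract of packed int8x4) and __dp4a
// (packed int8 dot product accumulated into an int32), preferring the compiler builtins
// __builtin_elementwise_sub_sat / __builtin_amdgcn_sdot4 when they are available.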
  937. static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
  938. const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
  939. int x32 = 0;
  940. x32 |= x16[0] << 0;
  941. x32 |= x16[1] << 16;
  942. return x32;
  943. }
  944. static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
  945. const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
  946. int x32 = 0;
  947. x32 |= x16[0] << 0;
  948. x32 |= x16[1] << 16;
  949. return x32;
  950. }
  951. static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
  952. return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  953. }
  954. static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
  955. return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  956. }
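// The four get_int_from_* helpers above load 4 consecutive quant bytes as one 32-bit integer
// (the layout expected by __dp4a): the unaligned variants assemble it from two 16-bit loads,
// while the *_aligned variants do a direct 32-bit load and therefore require 4-byte alignment,
// as noted in their comments.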
  957. // Dequant functions
  958. static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
  959. const block_q4_0 * x = (const block_q4_0 *) vx;
  960. const dfloat d = x[ib].d;
  961. const int vui = x[ib].qs[iqs];
  962. v.x = __int2half_rn(vui & 0xF);
  963. v.y = __int2half_rn(vui >> 4);
  964. v = __hsub2(v, __floats2half2_rn(8.0f, 8.0f));
  965. v = __hmul2(v, {d, d});
  966. }
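// dequantize_q4_0 above shows the pattern shared by the dequantize_* helpers that follow: unpack
// two quants from one byte into v.x / v.y (low and high nibble here), undo the format's offset
// (Q4_0 stores unsigned 4-bit values centered at 8), and scale both lanes by the block's
// half-precision delta d using half2 arithmetic.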
  967. static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  968. const block_q4_1 * x = (const block_q4_1 *) vx;
  969. const dfloat d = __low2half(x[ib].dm);
  970. const dfloat m = __high2half(x[ib].dm);
  971. const int vui = x[ib].qs[iqs];
  972. v.x = __int2half_rn(vui & 0xF);
  973. v.y = __int2half_rn(vui >> 4);
  974. v = __hmul2(v, {d, d});
  975. v = __hadd2(v, {m, m});
  976. }
  977. static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
  978. const block_q5_0 * x = (const block_q5_0 *) vx;
  979. const dfloat d = x[ib].d;
  980. uint32_t qh;
  981. memcpy(&qh, x[ib].qh, sizeof(qh));
  982. const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
  983. const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
  984. v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0);
  985. v.y = __int2half_rn((x[ib].qs[iqs] >> 4) | xh_1);
  986. v = __hsub2(v, __floats2half2_rn(16.0f, 16.0f));
  987. v = __hmul2(v, {d, d});
  988. }
  989. static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  990. const block_q5_1 * x = (const block_q5_1 *) vx;
  991. const dfloat d = __low2half(x[ib].dm);
  992. const dfloat m = __high2half(x[ib].dm);
  993. uint32_t qh;
  994. memcpy(&qh, x[ib].qh, sizeof(qh));
  995. const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
  996. const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
  997. v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0);
  998. v.y = __int2half_rn((x[ib].qs[iqs] >> 4) | xh_1);
  999. v = __hmul2(v, {d, d});
  1000. v = __hadd2(v, {m, m});
  1001. }
  1002. static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
  1003. const block_q8_0 * x = (const block_q8_0 *) vx;
  1004. const dfloat d = x[ib].d;
  1005. v.x = __int2half_rn(x[ib].qs[iqs + 0]);
  1006. v.y = __int2half_rn(x[ib].qs[iqs + 1]);
  1007. v = __hmul2(v, {d, d});
  1008. }
  1009. template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
  1010. static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
  1011. const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
  1012. if (i >= k) {
  1013. return;
  1014. }
  1015. const int ib = i/qk; // block index
  1016. const int iqs = (i%qk)/qr; // quant index
  1017. const int iybs = i - i%qk; // y block start index
  1018. const int y_offset = qr == 1 ? 1 : qk/2;
  1019. // dequantize
  1020. dfloat2 v;
  1021. dequantize_kernel(vx, ib, iqs, v);
  1022. y[iybs + iqs + 0] = v.x;
  1023. y[iybs + iqs + y_offset] = v.y;
  1024. }
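// dequantize_block assigns two consecutive quantized values to each thread: ib is the source
// block, iqs the quant index inside it, and the pair is written to y at iybs + iqs and
// iybs + iqs + y_offset (y_offset is qk/2 when each byte packs two quants, i.e. qr == 2).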
  1025. template<typename dst_t>
  1026. static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1027. const int i = blockIdx.x;
  1028. const block_q2_K * x = (const block_q2_K *) vx;
  1029. const int tid = threadIdx.x;
  1030. const int n = tid/32;
  1031. const int l = tid - 32*n;
  1032. const int is = 8*n + l/16;
  1033. const uint8_t q = x[i].qs[32*n + l];
  1034. dst_t * y = yy + i*QK_K + 128*n;
  1035. half dall = __low2half(x[i].dm);
  1036. half dmin = __high2half(x[i].dm);
  1037. y[l+ 0] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4)));
  1038. y[l+32] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4)));
  1039. y[l+64] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4)));
  1040. y[l+96] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4)));
  1041. }
  1042. template<typename dst_t>
  1043. static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1044. const int i = blockIdx.x;
  1045. const block_q3_K * x = (const block_q3_K *) vx;
  1046. const int r = threadIdx.x/4;
  1047. const int tid = r/2;
  1048. const int is0 = r%2;
  1049. const int l0 = 16*is0 + 4*(threadIdx.x%4);
  1050. const int n = tid / 4;
  1051. const int j = tid - 4*n;
  1052. uint8_t m = 1 << (4*n + j);
  1053. int is = 8*n + 2*j + is0;
  1054. int shift = 2*j;
  1055. int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
  1056. is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
  1057. is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
  1058. (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
  1059. half d_all = x[i].d;
  1060. half dl = __hmul(d_all, __int2half_rn(us - 32));
  1061. dst_t * y = yy + i*QK_K + 128*n + 32*j;
  1062. const uint8_t * q = x[i].qs + 32*n;
  1063. const uint8_t * hm = x[i].hmask;
  1064. for (int l = l0; l < l0+4; ++l) y[l] = __hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)));
  1065. }
  1066. static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
  1067. if (j < 4) {
  1068. d = q[j] & 63; m = q[j + 4] & 63;
  1069. } else {
  1070. d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
  1071. m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
  1072. }
  1073. }
  1074. template<typename dst_t>
  1075. static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1076. const block_q4_K * x = (const block_q4_K *) vx;
  1077. const int i = blockIdx.x;
  1078. // assume 32 threads
  1079. const int tid = threadIdx.x;
  1080. const int il = tid/8;
  1081. const int ir = tid%8;
  1082. const int is = 2*il;
  1083. const int n = 4;
  1084. dst_t * y = yy + i*QK_K + 64*il + n*ir;
  1085. const half dall = __low2half(x[i].dm);
  1086. const half dmin = __high2half(x[i].dm);
  1087. const uint8_t * q = x[i].qs + 32*il + n*ir;
  1088. uint8_t sc, m;
  1089. get_scale_min_k4(is + 0, x[i].scales, sc, m);
  1090. const half d1 = __hmul(dall, __int2half_rn(sc));
  1091. const half m1 = __hmul(dmin, __int2half_rn(m));
  1092. get_scale_min_k4(is + 1, x[i].scales, sc, m);
  1093. const half d2 = __hmul(dall, __int2half_rn(sc));
  1094. const half m2 = __hmul(dmin, __int2half_rn(m));
  1095. for (int l = 0; l < n; ++l) {
  1096. y[l + 0] = __hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1);
  1097. y[l +32] = __hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2);
  1098. }
  1099. }
  1100. template<typename dst_t>
  1101. static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1102. const block_q5_K * x = (const block_q5_K *) vx;
  1103. const int i = blockIdx.x;
1104. // assume 64 threads
  1105. const int tid = threadIdx.x;
  1106. const int il = tid/16; // il is in 0...3
  1107. const int ir = tid%16; // ir is in 0...15
  1108. const int is = 2*il; // is is in 0...6
  1109. dst_t * y = yy + i*QK_K + 64*il + 2*ir;
  1110. const half dall = __low2half(x[i].dm);
  1111. const half dmin = __high2half(x[i].dm);
  1112. const uint8_t * ql = x[i].qs + 32*il + 2*ir;
  1113. const uint8_t * qh = x[i].qh + 2*ir;
  1114. uint8_t sc, m;
  1115. get_scale_min_k4(is + 0, x[i].scales, sc, m);
  1116. const half d1 = __hmul(dall, __int2half_rn(sc)); const half m1 = __hmul(dmin, __int2half_rn(m));
  1117. get_scale_min_k4(is + 1, x[i].scales, sc, m);
  1118. const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m));
  1119. uint8_t hm = 1 << (2*il);
  1120. y[ 0] = __hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1);
  1121. y[ 1] = __hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1);
  1122. hm <<= 1;
  1123. y[32] = __hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2);
  1124. y[33] = __hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 16 : 0))), m2);
  1125. }
  1126. template<typename dst_t>
  1127. static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1128. const block_q6_K * x = (const block_q6_K *) vx;
  1129. const int i = blockIdx.x;
1130. // assume 64 threads
  1131. const int tid = threadIdx.x;
  1132. const int ip = tid/32; // ip is 0 or 1
1133. const int il = tid - 32*ip; // 0...31
  1134. const int is = 8*ip + il/16;
  1135. dst_t * y = yy + i*QK_K + 128*ip + il;
  1136. const half d = x[i].d;
  1137. const uint8_t * ql = x[i].ql + 64*ip + il;
  1138. const uint8_t qh = x[i].qh[32*ip + il];
  1139. const int8_t * sc = x[i].scales + is;
  1140. y[ 0] = __hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)));
  1141. y[32] = __hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
  1142. y[64] = __hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)));
  1143. y[96] = __hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)));
  1144. }
  1145. template<typename dst_t>
  1146. static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1147. const int i = blockIdx.x;
  1148. const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
  1149. const int tid = threadIdx.x;
  1150. const int il = tid/8; // 0...3
  1151. const int ib = tid%8; // 0...7
  1152. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1153. const uint16_t * q2 = x[i].qs + 4*ib;
  1154. const uint8_t * aux8 = (const uint8_t *)q2;
  1155. const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
  1156. const uint32_t aux32 = q2[2] | (q2[3] << 16);
  1157. const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f;
  1158. const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
  1159. for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
  1160. }
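// The iq2_xxs kernel above and the iq2_xs/iq2_s/iq3_xxs/iq3_s kernels below follow the same
// recipe: packed indices pick magnitude patterns out of the *_grid codebooks, sign bits (via
// ksigns_iq2xs and kmask_iq2xs, or explicit sign bytes for the *_s formats) flip individual
// elements, and the result is scaled by the block delta d combined with a small per-group scale.
// iq1_s uses a signed codebook directly, while iq4_nl/iq4_xs look values up in kvalues_iq4nl.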
  1161. template<typename dst_t>
  1162. static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1163. const int i = blockIdx.x;
  1164. const block_iq2_xs * x = (const block_iq2_xs *) vx;
  1165. const int tid = threadIdx.x;
  1166. const int il = tid/8; // 0...3
  1167. const int ib = tid%8; // 0...7
  1168. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1169. const uint16_t * q2 = x[i].qs + 4*ib;
  1170. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
  1171. const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  1172. const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
  1173. for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
  1174. }
  1175. template<typename dst_t>
  1176. static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1177. const int i = blockIdx.x;
  1178. const block_iq2_s * x = (const block_iq2_s *) vx;
  1179. const int tid = threadIdx.x;
  1180. const int il = tid/8; // 0...3
  1181. const int ib = tid%8; // 0...7
  1182. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1183. const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
  1184. const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  1185. const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
  1186. for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
  1187. }
  1188. template<typename dst_t>
  1189. static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1190. const int i = blockIdx.x;
  1191. const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
  1192. const int tid = threadIdx.x;
  1193. const int il = tid/8; // 0...3
  1194. const int ib = tid%8; // 0...7
  1195. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1196. const uint8_t * q3 = x[i].qs + 8*ib;
  1197. const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
  1198. const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
  1199. const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
  1200. const uint32_t aux32 = gas[0] | (gas[1] << 16);
  1201. const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f;
  1202. const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
  1203. for (int j = 0; j < 4; ++j) {
  1204. y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
  1205. y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
  1206. }
  1207. }
  1208. template<typename dst_t>
  1209. static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1210. const int i = blockIdx.x;
  1211. const block_iq3_s * x = (const block_iq3_s *) vx;
  1212. const int tid = threadIdx.x;
  1213. const int il = tid/8; // 0...3
  1214. const int ib = tid%8; // 0...7
  1215. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1216. const uint8_t * qs = x[i].qs + 8*ib;
  1217. const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
  1218. const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
  1219. const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
  1220. const uint8_t signs = x[i].signs[4*ib + il];
  1221. for (int j = 0; j < 4; ++j) {
  1222. y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
  1223. y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
  1224. }
  1225. }
  1226. template<typename dst_t>
  1227. static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1228. const int i = blockIdx.x;
  1229. const block_iq1_s * x = (const block_iq1_s *) vx;
  1230. const int tid = threadIdx.x;
  1231. const int il = tid/8; // 0...3
  1232. const int ib = tid%8; // 0...7
  1233. dst_t * y = yy + i*QK_K + 32*ib + 8*il;
  1234. const int i8 = 4*ib+il;
  1235. uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
  1236. const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
  1237. const float d = __half2float(x[i].d) * (2*(h & 7) + 1);
  1238. for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j]);
  1239. }
  1240. template<typename dst_t>
  1241. static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1242. const int i = blockIdx.x;
  1243. const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
  1244. const int tid = threadIdx.x;
  1245. const int il = tid/8; // 0...3
  1246. const int ib = tid%8; // 0...7
  1247. dst_t * y = yy + i*QK_K + 32*ib + 4*il;
  1248. const uint8_t * q4 = x[ib].qs + 4*il;
  1249. const float d = __half2float(x[ib].d);
  1250. for (int j = 0; j < 4; ++j) {
  1251. y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
  1252. y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]);
  1253. }
  1254. }
  1255. template<typename dst_t>
  1256. static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  1257. const int i = blockIdx.x;
  1258. const block_iq4_xs * x = (const block_iq4_xs *)vx;
  1259. const int tid = threadIdx.x;
  1260. const int il = tid/8; // 0...3
  1261. const int ib = tid%8; // 0...7
  1262. dst_t * y = yy + i*QK_K + 32*ib + 4*il;
  1263. const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
  1264. const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
  1265. for (int j = 0; j < 4; ++j) {
  1266. y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
  1267. y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]);
  1268. }
  1269. }
  1270. template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
  1271. static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
  1272. const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
  1273. dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  1274. }
  1275. template<typename dst_t>
  1276. static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1277. const int nb = k / QK_K;
  1278. dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
  1279. }
  1280. template<typename dst_t>
  1281. static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1282. const int nb = k / QK_K;
  1283. dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
  1284. }
  1285. template<typename dst_t>
  1286. static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1287. const int nb = k / QK_K;
  1288. dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
  1289. }
  1290. template<typename dst_t>
  1291. static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1292. const int nb = k / QK_K;
  1293. dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
  1294. }
  1295. template<typename dst_t>
  1296. static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1297. const int nb = k / QK_K;
  1298. dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
  1299. }
  1300. template<typename dst_t>
  1301. static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1302. const int nb = k / QK_K;
  1303. dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
  1304. }
  1305. template<typename dst_t>
  1306. static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1307. const int nb = k / QK_K;
  1308. dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
  1309. }
  1310. template<typename dst_t>
  1311. static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1312. const int nb = k / QK_K;
  1313. dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
  1314. }
  1315. template<typename dst_t>
  1316. static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1317. const int nb = k / QK_K;
  1318. dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
  1319. }
  1320. template<typename dst_t>
  1321. static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1322. const int nb = k / QK_K;
  1323. dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
  1324. }
  1325. template<typename dst_t>
  1326. static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1327. const int nb = k / QK_K;
  1328. dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
  1329. }
  1330. template<typename dst_t>
  1331. static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1332. const int nb = (k + QK_K - 1) / QK_K;
  1333. dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
  1334. }
  1335. template<typename dst_t>
  1336. static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  1337. const int nb = (k + QK_K - 1) / QK_K;
  1338. dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
  1339. }
  1340. static to_fp16_cuda_t ggml_get_to_fp16_cuda(int type) {
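// The case labels below are the ggml/GGUF tensor type IDs of the supported quantized formats
// (e.g. 2 = Q4_0, 8 = Q8_0, 10..14 = Q2_K..Q6_K, 16..23 = the IQ formats); any other type
// yields nullptr so the caller can detect unsupported inputs.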
  1341. switch (type) {
  1342. case 2:
  1343. return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
  1344. case 3:
  1345. return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
  1346. case 6:
  1347. return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
  1348. case 7:
  1349. return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
  1350. case 8:
  1351. return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
  1352. case 10:
  1353. return dequantize_row_q2_K_cuda;
  1354. case 11:
  1355. return dequantize_row_q3_K_cuda;
  1356. case 12:
  1357. return dequantize_row_q4_K_cuda;
  1358. case 13:
  1359. return dequantize_row_q5_K_cuda;
  1360. case 14:
  1361. return dequantize_row_q6_K_cuda;
  1362. case 16:
  1363. return dequantize_row_iq2_xxs_cuda;
  1364. case 17:
  1365. return dequantize_row_iq2_xs_cuda;
  1366. case 18:
  1367. return dequantize_row_iq3_xxs_cuda;
  1368. case 19:
  1369. return dequantize_row_iq1_s_cuda;
  1370. case 20:
  1371. return dequantize_row_iq4_nl_cuda;
  1372. case 21:
  1373. return dequantize_row_iq3_s_cuda;
  1374. case 22:
  1375. return dequantize_row_iq2_s_cuda;
  1376. case 23:
  1377. return dequantize_row_iq4_xs_cuda;
  1378. default:
  1379. return nullptr;
  1380. }
  1381. }
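// Minimal usage sketch (illustrative only; the buffer names below are assumptions):
//   to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(ggml_type_id);
//   if (to_fp16 != nullptr) {
//       // d_quant: device pointer to the quantized blocks, d_half: device buffer of n halves
//       to_fp16(d_quant, d_half, n, stream);
//   }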
1382. // GEMV: fused dequantize + matrix-vector multiplication kernels
  1383. template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
  1384. static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, dfloat * __restrict__ dst, const int ncols, const int nrows) {
  1385. // qk = quantized weights per x block
  1386. // qr = number of quantized weights per data value in x block
  1387. const int row = blockIdx.x*blockDim.y + threadIdx.y;
  1388. if (row >= nrows) {
  1389. return;
  1390. }
  1391. const int tid = threadIdx.x;
  1392. const int iter_stride = 2*GGML_CUDA_DMMV_X;
  1393. const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
  1394. const int y_offset = qr == 1 ? 1 : qk/2;
  1395. half2 tmp = __floats2half2_rn(0.0f, 0.0f); // two sums for f16 to take advantage of half2 intrinsics
  1396. for (int i = 0; i < ncols; i += iter_stride) {
  1397. const int col = i + vals_per_iter*tid;
  1398. const int ib = (row*ncols + col)/qk; // x block index
  1399. const int iqs = (col%qk)/qr; // x quant index
  1400. const int iybs = col - col%qk; // y block start index
  1401. // processing >2 values per i iter is faster for fast GPUs
  1402. #pragma unroll
  1403. for (int j = 0; j < vals_per_iter; j += 2) {
  1404. // process 2 vals per j iter
  1405. // dequantize
  1406. // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
  1407. dfloat2 v;
  1408. dequantize_kernel(vx, ib, iqs + j/qr, v);
  1409. // matrix multiplication
  1410. // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
  1411. tmp = __hadd2(tmp, __hmul2(v, {
  1412. y[iybs + iqs + j/qr + 0],
  1413. y[iybs + iqs + j/qr + y_offset]
  1414. }));
  1415. }
  1416. }
  1417. // sum up partial sums and write back result
  1418. #pragma unroll
  1419. for (int mask = 16; mask > 0; mask >>= 1) {
  1420. tmp = __hadd2(tmp, __shfl_xor_sync(0xffffffff, tmp, mask, 32));
  1421. }
  1422. if (tid == 0) {
  1423. dst[row] = __hadd(tmp.x, tmp.y);
  1424. }
  1425. }
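// Editor's note: the k-quant and iq variants that follow keep the same
// accumulate / butterfly-reduce / lane-0-writes structure, but operate on
// QK_K-wide super-blocks (ncols/QK_K blocks per row). Each thread strides over
// the super-blocks of its row, unpacks the block's packed scales and mins into
// registers, and accumulates a plain float partial sum. K_QUANTS_PER_ITERATION
// is a compile-time tuning knob that is expected to be 1 or 2 (see the
// static_asserts and the "0 or 0,1" comments below).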
  1426. static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1427. static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
  1428. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1429. if (row >= nrows) return;
  1430. const int num_blocks_per_row = ncols / QK_K;
  1431. const int ib0 = row*num_blocks_per_row;
  1432. const block_q2_K * x = (const block_q2_K *)vx + ib0;
  1433. float tmp = 0; // partial sum for thread in warp
  1434. const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  1435. const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
  1436. const int step = 16/K_QUANTS_PER_ITERATION;
  1437. const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
  1438. const int in = tid - step*im; // 0...15 or 0...7
  1439. const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
  1440. const int q_offset = 32*im + l0;
  1441. const int s_offset = 8*im;
  1442. const int y_offset = 128*im + l0;
  1443. uint32_t aux[4];
  1444. const uint8_t * d = (const uint8_t *)aux;
  1445. const uint8_t * m = (const uint8_t *)(aux + 2);
  1446. for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
  1447. const half * y = yy + i * QK_K + y_offset;
  1448. const uint8_t * q = x[i].qs + q_offset;
  1449. const float dall = __low2float(x[i].dm);
  1450. const float dmin = __high2float(x[i].dm);
  1451. const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
  1452. aux[0] = a[0] & 0x0f0f0f0f;
  1453. aux[1] = a[1] & 0x0f0f0f0f;
  1454. aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
  1455. aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
  1456. float sum1 = 0, sum2 = 0;
  1457. for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
  1458. sum1 += __half2float(y[l+ 0]) * d[0] * ((q[l+ 0] >> 0) & 3)
  1459. + __half2float(y[l+32]) * d[2] * ((q[l+ 0] >> 2) & 3)
  1460. + __half2float(y[l+64]) * d[4] * ((q[l+ 0] >> 4) & 3)
  1461. + __half2float(y[l+96]) * d[6] * ((q[l+ 0] >> 6) & 3)
  1462. + __half2float(y[l+16]) * d[1] * ((q[l+16] >> 0) & 3)
  1463. + __half2float(y[l+48]) * d[3] * ((q[l+16] >> 2) & 3)
  1464. + __half2float(y[l+80]) * d[5] * ((q[l+16] >> 4) & 3)
  1465. +__half2float(y[l+112]) * d[7] * ((q[l+16] >> 6) & 3);
  1466. sum2 += __half2float(y[l+ 0]) * m[0] + __half2float(y[l+32]) * m[2] + __half2float(y[l+64]) * m[4] + __half2float(y[ l+96]) * m[6]
  1467. + __half2float(y[l+16]) * m[1] + __half2float(y[l+48]) * m[3] + __half2float(y[l+80]) * m[5] + __half2float(y[l+112]) * m[7];
  1468. }
  1469. tmp += dall * sum1 - dmin * sum2;
  1470. }
  1471. // sum up partial sums and write back result
  1472. #pragma unroll
  1473. for (int mask = 16; mask > 0; mask >>= 1) {
  1474. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1475. }
  1476. if (threadIdx.x == 0) {
  1477. dst[row] = __float2half(tmp);
  1478. }
  1479. }
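// Editor's note on the q2_K decode above: each super-block stores one packed
// scale byte per 16 values, with the low nibble acting as a 4-bit scale and the
// high nibble as a 4-bit min. The aux[] trick unpacks eight of those bytes at
// once (aux[0..1] hold the low nibbles seen through d[], aux[2..3] the high
// nibbles seen through m[]), so the inner loop can index scales and mins as
// plain bytes. The block result is dall*sum1 - dmin*sum2, with dall/dmin taken
// from the two halves of the half2 dm field.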
  1480. static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1481. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1482. if (row >= nrows) return;
  1483. const int num_blocks_per_row = ncols / QK_K;
  1484. const int ib0 = row*num_blocks_per_row;
  1485. const block_q3_K * x = (const block_q3_K *)vx + ib0;
  1486. float tmp = 0; // partial sum for thread in warp
  1487. const uint16_t kmask1 = 0x0303;
  1488. const uint16_t kmask2 = 0x0f0f;
1489. const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  1490. const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
  1491. const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
  1492. const int step = 16/K_QUANTS_PER_ITERATION;
  1493. const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
1494. const int in = tid - step*im; // 0...15 or 0...7
  1495. const uint8_t m = 1 << (4*im);
  1496. const int l0 = n*in; // 0...15 or 0...14 in steps of 2
  1497. const int q_offset = 32*im + l0;
  1498. const int y_offset = 128*im + l0;
  1499. uint16_t utmp[4];
  1500. const int8_t * s = (const int8_t *)utmp;
  1501. const uint16_t s_shift = 4*im;
  1502. for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
  1503. const half * y = yy + i * QK_K + y_offset;
  1504. const uint8_t * q = x[i].qs + q_offset;
  1505. const uint8_t * h = x[i].hmask + l0;
  1506. const uint16_t * a = (const uint16_t *)x[i].scales;
  1507. utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
  1508. utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
  1509. utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
  1510. utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
  1511. const float d = __half2float(x[i].d);
  1512. float sum = 0;
  1513. for (int l = 0; l < n; ++l) {
  1514. sum += __half2float(y[l+ 0]) * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
  1515. + __half2float(y[l+32]) * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
  1516. + __half2float(y[l+64]) * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
  1517. + __half2float(y[l+96]) * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
  1518. sum += __half2float(y[l+16]) * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
  1519. + __half2float(y[l+48]) * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
  1520. + __half2float(y[l+80]) * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
  1521. + __half2float(y[l+112]) * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
  1522. }
  1523. tmp += d * sum;
  1524. }
  1525. // sum up partial sums and write back result
  1526. #pragma unroll
  1527. for (int mask = 16; mask > 0; mask >>= 1) {
  1528. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1529. }
  1530. if (threadIdx.x == 0) {
  1531. dst[row] = __float2half(tmp);
  1532. }
  1533. }
  1534. static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1535. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1536. if (row >= nrows) return;
  1537. const int num_blocks_per_row = ncols / QK_K;
  1538. const int ib0 = row*num_blocks_per_row;
  1539. const block_q4_K * x = (const block_q4_K *)vx + ib0;
  1540. const uint16_t kmask1 = 0x3f3f;
  1541. const uint16_t kmask2 = 0x0f0f;
  1542. const uint16_t kmask3 = 0xc0c0;
1543. const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  1544. const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
  1545. const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
  1546. const int il = tid/step; // 0...3
  1547. const int ir = tid - step*il; // 0...7 or 0...3
  1548. const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
  1549. const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
  1550. const int in = il%2;
  1551. const int l0 = n*(2*ir + in);
  1552. const int q_offset = 32*im + l0;
  1553. const int y_offset = 64*im + l0;
  1554. uint16_t aux[4];
  1555. const uint8_t * sc = (const uint8_t *)aux;
  1556. #if K_QUANTS_PER_ITERATION == 2
  1557. uint32_t q32[4];
  1558. const uint8_t * q4 = (const uint8_t *)q32;
  1559. #else
  1560. uint16_t q16[4];
  1561. const uint8_t * q4 = (const uint8_t *)q16;
  1562. #endif
  1563. float tmp = 0; // partial sum for thread in warp
  1564. for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
  1565. const half * y1 = yy + i*QK_K + y_offset;
  1566. const half * y2 = y1 + 128;
  1567. const float dall = __low2float(x[i].dm);
  1568. const float dmin = __high2float(x[i].dm);
  1569. const uint16_t * a = (const uint16_t *)x[i].scales;
  1570. aux[0] = a[im+0] & kmask1;
  1571. aux[1] = a[im+2] & kmask1;
  1572. aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
  1573. aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
  1574. #if K_QUANTS_PER_ITERATION == 2
  1575. const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
  1576. const uint32_t * q2 = q1 + 16;
  1577. q32[0] = q1[0] & 0x0f0f0f0f;
  1578. q32[1] = q1[0] & 0xf0f0f0f0;
  1579. q32[2] = q2[0] & 0x0f0f0f0f;
  1580. q32[3] = q2[0] & 0xf0f0f0f0;
  1581. float4 s = {0.f, 0.f, 0.f, 0.f};
  1582. float smin = 0;
  1583. for (int l = 0; l < 4; ++l) {
  1584. s.x += __half2float(y1[l]) * q4[l+0]; s.y += __half2float(y1[l+32]) * q4[l+ 4];
  1585. s.z += __half2float(y2[l]) * q4[l+8]; s.w += __half2float(y2[l+32]) * q4[l+12];
  1586. smin += __half2float(y1[l]) * sc[2] + __half2float(y1[l+32]) * sc[3] + __half2float(y2[l]) * sc[6] + __half2float(y2[l+32]) * sc[7];
  1587. }
  1588. tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
  1589. #else
  1590. const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
  1591. const uint16_t * q2 = q1 + 32;
  1592. q16[0] = q1[0] & 0x0f0f;
  1593. q16[1] = q1[0] & 0xf0f0;
  1594. q16[2] = q2[0] & 0x0f0f;
  1595. q16[3] = q2[0] & 0xf0f0;
  1596. float4 s = {0.f, 0.f, 0.f, 0.f};
  1597. float smin = 0;
  1598. for (int l = 0; l < 2; ++l) {
  1599. s.x += __half2float(y1[l]) * q4[l+0]; s.y += __half2float(y1[l+32]) * q4[l+2];
  1600. s.z += __half2float(y2[l]) * q4[l+4]; s.w += __half2float(y2[l+32]) * q4[l+6];
  1601. smin += __half2float(y1[l]) * sc[2] + __half2float(y1[l+32]) * sc[3] + __half2float(y2[l]) * sc[6] + __half2float(y2[l+32]) * sc[7];
  1602. }
  1603. tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
  1604. #endif
  1605. }
  1606. // sum up partial sums and write back result
  1607. #pragma unroll
  1608. for (int mask = 16; mask > 0; mask >>= 1) {
  1609. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1610. }
  1611. if (tid == 0) {
  1612. dst[row] = __float2half(tmp);
  1613. }
  1614. }
  1615. static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols) {
  1616. const int row = blockIdx.x;
  1617. const int num_blocks_per_row = ncols / QK_K;
  1618. const int ib0 = row*num_blocks_per_row;
  1619. const block_q5_K * x = (const block_q5_K *)vx + ib0;
  1620. float tmp = 0; // partial sum for thread in warp
  1621. const uint16_t kmask1 = 0x3f3f;
  1622. const uint16_t kmask2 = 0x0f0f;
  1623. const uint16_t kmask3 = 0xc0c0;
  1624. const int tid = threadIdx.x/2; // 0...15
  1625. const int ix = threadIdx.x%2;
  1626. const int il = tid/4; // 0...3
  1627. const int ir = tid - 4*il;// 0...3
  1628. const int n = 2;
  1629. const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
  1630. const int in = il%2;
  1631. const int l0 = n*(2*ir + in);
  1632. const int q_offset = 32*im + l0;
  1633. const int y_offset = 64*im + l0;
  1634. const uint8_t hm1 = 1 << (2*im);
  1635. const uint8_t hm2 = hm1 << 4;
  1636. uint16_t aux[4];
  1637. const uint8_t * sc = (const uint8_t *)aux;
  1638. uint16_t q16[8];
  1639. const uint8_t * q4 = (const uint8_t *)q16;
  1640. for (int i = ix; i < num_blocks_per_row; i += 2) {
  1641. const uint8_t * ql1 = x[i].qs + q_offset;
  1642. const uint8_t * qh = x[i].qh + l0;
  1643. const half * y1 = yy + i*QK_K + y_offset;
  1644. const half * y2 = y1 + 128;
  1645. const float dall = __low2float(x[i].dm);
  1646. const float dmin = __high2float(x[i].dm);
  1647. const uint16_t * a = (const uint16_t *)x[i].scales;
  1648. aux[0] = a[im+0] & kmask1;
  1649. aux[1] = a[im+2] & kmask1;
  1650. aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
  1651. aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
  1652. float4 sum = {0.f, 0.f, 0.f, 0.f};
  1653. float smin = 0;
  1654. const uint16_t * q1 = (const uint16_t *)ql1;
  1655. const uint16_t * q2 = q1 + 32;
  1656. q16[0] = q1[0] & 0x0f0f;
  1657. q16[1] = q1[8] & 0x0f0f;
  1658. q16[2] = (q1[0] >> 4) & 0x0f0f;
  1659. q16[3] = (q1[8] >> 4) & 0x0f0f;
  1660. q16[4] = q2[0] & 0x0f0f;
  1661. q16[5] = q2[8] & 0x0f0f;
  1662. q16[6] = (q2[0] >> 4) & 0x0f0f;
  1663. q16[7] = (q2[8] >> 4) & 0x0f0f;
  1664. for (int l = 0; l < n; ++l) {
  1665. sum.x += __half2float(y1[l+ 0]) * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
  1666. + __half2float(y1[l+16]) * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
  1667. sum.y += __half2float(y1[l+32]) * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
  1668. + __half2float(y1[l+48]) * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
  1669. sum.z += __half2float(y2[l+ 0]) * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
  1670. + __half2float(y2[l+16]) * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
  1671. sum.w += __half2float(y2[l+32]) * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
  1672. + __half2float(y2[l+48]) * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
  1673. smin += (__half2float(y1[l]) + __half2float(y1[l+16])) * sc[2] + (__half2float(y1[l+32]) + __half2float(y1[l+48])) * sc[3]
  1674. + (__half2float(y2[l]) + __half2float(y2[l+16])) * sc[6] + (__half2float(y2[l+32]) + __half2float(y2[l+48])) * sc[7];
  1675. }
  1676. tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
  1677. }
  1678. // sum up partial sums and write back result
  1679. #pragma unroll
  1680. for (int mask = 16; mask > 0; mask >>= 1) {
  1681. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1682. }
  1683. if (threadIdx.x == 0) {
  1684. dst[row] = __float2half(tmp);
  1685. }
  1686. }
  1687. static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1688. static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
  1689. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1690. if (row >= nrows) return;
  1691. const int num_blocks_per_row = ncols / QK_K;
  1692. const int ib0 = row*num_blocks_per_row;
  1693. const block_q6_K * x = (const block_q6_K *)vx + ib0;
1694. const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  1695. const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
  1696. const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
  1697. const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
  1698. const int in = tid - step*im; // 0...15 or 0...7
  1699. #if K_QUANTS_PER_ITERATION == 1
  1700. const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
  1701. const int is = 0;
  1702. #else
  1703. const int l0 = 4 * in; // 0, 4, 8, ..., 28
  1704. const int is = in / 4;
  1705. #endif
  1706. const int ql_offset = 64*im + l0;
  1707. const int qh_offset = 32*im + l0;
  1708. const int s_offset = 8*im + is;
  1709. const int y_offset = 128*im + l0;
  1710. float tmp = 0; // partial sum for thread in warp
  1711. for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
  1712. const half * y = yy + i * QK_K + y_offset;
  1713. const uint8_t * ql = x[i].ql + ql_offset;
  1714. const uint8_t * qh = x[i].qh + qh_offset;
  1715. const int8_t * s = x[i].scales + s_offset;
  1716. const float d = __half2float(x[i].d);
  1717. #if K_QUANTS_PER_ITERATION == 1
  1718. float sum = __half2float(y[ 0]) * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
  1719. + __half2float(y[16]) * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
  1720. + __half2float(y[32]) * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
  1721. + __half2float(y[48]) * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
  1722. + __half2float(y[64]) * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
  1723. + __half2float(y[80]) * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
  1724. + __half2float(y[96]) * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
  1725. +__half2float(y[112]) * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
  1726. tmp += sum;
  1727. #else
  1728. float sum = 0;
  1729. for (int l = 0; l < 4; ++l) {
  1730. sum += __half2float(y[l+ 0]) * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
  1731. + __half2float(y[l+32]) * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
  1732. + __half2float(y[l+64]) * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
  1733. + __half2float(y[l+96]) * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
  1734. }
  1735. tmp += sum;
  1736. #endif
  1737. }
  1738. // sum up partial sums and write back result
  1739. #pragma unroll
  1740. for (int mask = 16; mask > 0; mask >>= 1) {
  1741. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1742. }
  1743. if (tid == 0) {
  1744. dst[row] = __float2half(tmp);
  1745. }
  1746. }
  1747. static __global__ void dequantize_mul_mat_vec_iq2_xxs(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1748. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1749. if (row >= nrows) return;
  1750. const int num_blocks_per_row = ncols / QK_K;
  1751. const int ib0 = row*num_blocks_per_row;
  1752. const block_iq2_xxs * x = (const block_iq2_xxs *)vx + ib0;
  1753. float tmp = 0; // partial sum for thread in warp
  1754. const int tid = threadIdx.x/4;
  1755. const int ix = threadIdx.x%4;
  1756. const int q_offset = tid * 4;
  1757. const int y_offset = tid * 32;
  1758. for (int i = ix; i < num_blocks_per_row; i += 4) {
  1759. const half * y = yy + i * QK_K + y_offset;
  1760. const uint16_t * q = x[i].qs + q_offset;
  1761. const uint8_t * aux8 = (const uint8_t *)q;
  1762. uint32_t aux32 = q[2] | (q[3] << 16);
  1763. float sumi = 0;
  1764. for (int l = 0; l < 4; ++l) {
  1765. const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
  1766. const uint8_t signs = ksigns_iq2xs[aux32 & 127];
  1767. for (int j = 0; j < 8; ++j) {
  1768. sumi += __half2float(y[j]) * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  1769. }
  1770. y += 8;
  1771. aux32 >>= 7;
  1772. }
1773. tmp += sumi * __half2float(x[i].d) * (0.5f + aux32) * 0.25f;
  1774. }
  1775. // sum up partial sums and write back result
  1776. #pragma unroll
  1777. for (int mask = 16; mask > 0; mask >>= 1) {
  1778. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1779. }
  1780. if (threadIdx.x == 0) {
  1781. dst[row] = __float2half(tmp);
  1782. }
  1783. }
  1784. static __global__ void dequantize_mul_mat_vec_iq2_xs(const void * __restrict__ vx, const dfloat * __restrict__ yy, dfloat * __restrict__ dst, const int ncols, int nrows) {
  1785. const int row = blockIdx.x*blockDim.y + threadIdx.y;
1786. if (row >= nrows) return;
  1787. const int num_blocks_per_row = ncols / QK_K;
  1788. const int ib0 = row*num_blocks_per_row;
  1789. const block_iq2_xs * x = (const block_iq2_xs *)vx + ib0;
  1790. float tmp = 0; // partial sum for thread in warp
  1791. const int tid = threadIdx.x/4;
  1792. const int ix = threadIdx.x%4;
  1793. const int q_offset = tid * 4;
  1794. const int s_offset = tid;
  1795. const int y_offset = tid * 32;
  1796. for (int i = ix; i < num_blocks_per_row; i += 4) {
  1797. const half * y = yy + i * QK_K + y_offset;
  1798. const uint16_t * q = x[i].qs + q_offset;
  1799. const uint8_t ls1 = x[i].scales[s_offset] & 0xf;
  1800. const uint8_t ls2 = x[i].scales[s_offset] >> 4;
  1801. float sumi1 = 0;
  1802. for (int l = 0; l < 2; ++l) {
  1803. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q[l] & 511));
  1804. const uint8_t signs = ksigns_iq2xs[q[l] >> 9];
  1805. for (int j = 0; j < 8; ++j) {
  1806. sumi1 += __half2float(y[j]) * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  1807. }
  1808. y += 8;
  1809. }
  1810. float sumi2 = 0;
  1811. for (int l = 2; l < 4; ++l) {
  1812. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q[l] & 511));
  1813. const uint8_t signs = ksigns_iq2xs[q[l] >> 9];
  1814. for (int j = 0; j < 8; ++j) {
  1815. sumi2 += __half2float(y[j]) * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  1816. }
  1817. y += 8;
  1818. }
  1819. const float d = __half2float(x[i].d) * 0.25f;
1820. tmp += d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
  1821. }
  1822. // sum up partial sums and write back result
  1823. #pragma unroll
  1824. for (int mask = 16; mask > 0; mask >>= 1) {
  1825. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  1826. }
  1827. if (threadIdx.x == 0) {
  1828. dst[row] = __float2half(tmp);
  1829. }
  1830. }
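// Editor's note on the iq2 kernels above: each group of 8 values is rebuilt
// from a lookup table rather than from per-value bits. For iq2_xxs the low byte
// of each q entry indexes iq2xxs_grid and the packed aux32 word supplies 7 sign
// bits per group (via ksigns_iq2xs/kmask_iq2xs); the 4 bits left in aux32 after
// the four shifts act as the block's extra scale, hence d*(0.5f + aux32)*0.25f.
// For iq2_xs the low 9 bits of q[l] index iq2xs_grid, the top 7 bits select the
// sign pattern, and the per-half scales ls1/ls2 come from the two nibbles of
// x[i].scales.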
  1831. static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1832. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1833. // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
  1834. const dim3 block_nums(block_num_y, 1, 1);
  1835. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1836. dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
  1837. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1838. }
  1839. static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1840. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1841. const dim3 block_nums(block_num_y, 1, 1);
  1842. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1843. dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
  1844. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1845. }
  1846. static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1847. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1848. const dim3 block_nums(block_num_y, 1, 1);
  1849. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1850. dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
  1851. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1852. }
  1853. static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1854. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1855. const dim3 block_nums(block_num_y, 1, 1);
  1856. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1857. dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
  1858. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1859. }
  1860. static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1861. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  1862. const dim3 block_nums(block_num_y, 1, 1);
  1863. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  1864. dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
  1865. <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1866. }
  1867. static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1868. const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
  1869. const int block_num_y = (nrows + ny - 1) / ny;
  1870. const dim3 block_nums(block_num_y, 1, 1);
  1871. const dim3 block_dims(32, ny, 1);
  1872. dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1873. }
  1874. static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1875. const int ny = 2 / K_QUANTS_PER_ITERATION;
  1876. const int block_num_y = (nrows + ny - 1) / ny;
  1877. const dim3 block_nums(block_num_y, 1, 1);
  1878. const dim3 block_dims(32, ny, 1);
  1879. dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1880. }
  1881. static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1882. const int ny = 2 / K_QUANTS_PER_ITERATION;
  1883. const int block_num_y = (nrows + ny - 1) / ny;
  1884. const dim3 block_nums(block_num_y, 1, 1);
  1885. const dim3 block_dims(32, ny, 1);
  1886. dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1887. }
  1888. static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1889. const dim3 block_dims(32, 1, 1);
  1890. dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
  1891. }
  1892. static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1893. const int ny = 2 / K_QUANTS_PER_ITERATION;
  1894. const int block_num_y = (nrows + ny - 1) / ny;
  1895. const dim3 block_nums(block_num_y, 1, 1);
  1896. const dim3 block_dims(32, ny, 1);
  1897. dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1898. }
  1899. static void dequantize_mul_mat_vec_iq2_xxs_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1900. const dim3 block_dims(32, 1, 1);
  1901. dequantize_mul_mat_vec_iq2_xxs<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1902. }
  1903. static void dequantize_mul_mat_vec_iq2_xs_cuda(const void * vx, const dfloat * y, dfloat * dst, const int ncols, const int nrows, cudaStream_t stream) {
  1904. const dim3 block_dims(32, 1, 1);
  1905. dequantize_mul_mat_vec_iq2_xs<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  1906. }
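// Editor's note: the launchers above follow two grid conventions: the q4/q5/q8
// and most k-quant kernels use ceil(nrows / rows_per_block) blocks of
// (WARP_SIZE x rows_per_block) threads (rows_per_block = GGML_CUDA_MMV_Y or ny),
// while q5_K and the iq2 kernels launch one 32-thread block per row. A minimal
// host-side sketch, assuming d_x holds q4_0 blocks, d_y/d_dst are fp16 device
// buffers, and stream is a live cudaStream_t:
#if 0
// one fp16 output column = 4096x4096 q4_0 weight matrix times one fp16 input column
dequantize_mul_mat_vec_q4_0_cuda(d_x, d_y, d_dst, /*ncols=*/4096, /*nrows=*/4096, stream);
#endif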
  1907. // Q8 gemv
  1908. static __global__ void quantize_q8_1(const half * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
  1909. const int ix = blockDim.x*blockIdx.x + threadIdx.x;
  1910. if (ix >= kx_padded) {
  1911. return;
  1912. }
  1913. const int iy = blockDim.y*blockIdx.y + threadIdx.y;
  1914. const int i_padded = iy*kx_padded + ix;
  1915. block_q8_1 * y = (block_q8_1 *) vy;
  1916. const int ib = i_padded / QK8_1; // block index
  1917. const int iqs = i_padded % QK8_1; // quant index
  1918. const float xi = ix < kx ? __half2float(x[iy*kx + ix]) : 0.0f;
  1919. float amax = fabsf(xi);
  1920. float sum = xi;
  1921. #pragma unroll
  1922. for (int mask = 16; mask > 0; mask >>= 1) {
  1923. amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
  1924. sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
  1925. }
  1926. const float d = amax / 127;
  1927. const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
  1928. y[ib].qs[iqs] = q;
  1929. if (iqs > 0) {
  1930. return;
  1931. }
  1932. y[ib].ds.x = __float2half(d);
  1933. y[ib].ds.y = __float2half(sum);
  1934. }
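// Editor's note: quantize_q8_1 above maps one warp onto one QK8_1-wide block
// (QK8_1 is 32 in the usual ggml build -- an assumption of this note), so the
// warp reduction yields both the block's max |x| and its sum. Worked example:
// if the largest magnitude in a block is 6.35, then d = 6.35/127 = 0.05 and a
// value of 1.27 is stored as round(1.27/0.05) = 25; ds packs (d, sum) into one
// half2 so the dot-product kernels can recover both with a single load.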
  1935. static void quantize_row_q8_1_cuda(const half * x, void * vy, const int kx, const int ky, cudaStream_t stream) {
  1936. const int64_t kx_padded = (kx + 512 - 1) / 512 * 512;
  1937. const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
  1938. const dim3 num_blocks(block_num_x, ky, 1);
1939. const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); // must match the divisor used for block_num_x above
  1940. quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
  1941. }
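// Editor's note: VDR in the defines below presumably stands for "vec dot ratio",
// i.e. how many 32-bit integers (4 packed quants each) one vec_dot_*_impl call
// consumes; the *_MMVQ value would be used by the mul_mat_vec_q path and the
// *_MMQ value by the tiled mul_mat_q path. __dp4a(a, b, c) computes the dot
// product of the four packed 8-bit lanes of a and b and adds c, which is why
// the quants are first repacked into ints before the dot products.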
  1942. #define VDR_Q4_0_Q8_1_MMVQ 2
  1943. #define VDR_Q4_0_Q8_1_MMQ 4
  1944. template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
  1945. const int * v, const int * u, const float & d4, const half2 & ds8) {
  1946. int sumi = 0;
  1947. #pragma unroll
  1948. for (int i = 0; i < vdr; ++i) {
  1949. const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
  1950. const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
  1951. // SIMD dot product of quantized values
  1952. sumi = __dp4a(vi0, u[2*i+0], sumi);
  1953. sumi = __dp4a(vi1, u[2*i+1], sumi);
  1954. }
  1955. const float2 ds8f = __half22float2(ds8);
  1956. // second part effectively subtracts 8 from each quant value
  1957. return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
  1958. }
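// Editor's note (derivation, assuming QI4_0 == QK4_0/(4*QR4_0) == 4 as in ggml):
// a q4_0 value dequantizes to d4*(q - 8). With ds8 = (d8, s8), where s8 is the
// sum of the paired q8_1 block's values, the exact contribution of the quants
// covered by one call is d4*(d8*sumi - 8*sum(y covered)). The code substitutes
// the block-wide sum scaled by vdr/QI4_0, i.e. (8*vdr/QI4_0)*s8; summed over the
// QI4_0/vdr calls that cover one block this equals 8*s8 exactly, which is what
// the comment above means by "effectively subtracts 8 from each quant value".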
  1959. #define VDR_Q4_1_Q8_1_MMVQ 2
  1960. #define VDR_Q4_1_Q8_1_MMQ 4
  1961. template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
  1962. const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
  1963. int sumi = 0;
  1964. #pragma unroll
  1965. for (int i = 0; i < vdr; ++i) {
  1966. const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
  1967. const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
  1968. // SIMD dot product of quantized values
  1969. sumi = __dp4a(vi0, u[2*i+0], sumi);
  1970. sumi = __dp4a(vi1, u[2*i+1], sumi);
  1971. }
  1972. const float2 tmp = __half22float2(__hmul2(dm4, ds8));
  1973. const float d4d8 = tmp.x;
  1974. const float m4s8 = tmp.y;
  1975. // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
  1976. return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
  1977. }
  1978. #define VDR_Q5_0_Q8_1_MMVQ 2
  1979. #define VDR_Q5_0_Q8_1_MMQ 4
  1980. template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
  1981. const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
  1982. int sumi = 0;
  1983. #pragma unroll
  1984. for (int i = 0; i < vdr; ++i) {
  1985. int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
  1986. vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
  1987. vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
  1988. vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
  1989. vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
  1990. sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
  1991. int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
  1992. vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
  1993. vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
  1994. vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
  1995. vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
  1996. sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
  1997. }
  1998. const float2 ds8f = __half22float2(ds8);
  1999. // second part effectively subtracts 16 from each quant value
  2000. return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
  2001. }
  2002. #define VDR_Q5_1_Q8_1_MMVQ 2
  2003. #define VDR_Q5_1_Q8_1_MMQ 4
  2004. template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
  2005. const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
  2006. int sumi = 0;
  2007. #pragma unroll
  2008. for (int i = 0; i < vdr; ++i) {
  2009. int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
  2010. vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
  2011. vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
  2012. vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
  2013. vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
  2014. sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
  2015. int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
  2016. vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
  2017. vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
  2018. vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
  2019. vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
  2020. sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
  2021. }
  2022. const float2 tmp = __half22float2(__hmul2(dm5, ds8));
  2023. const float d5d8 = tmp.x;
  2024. const float m5s8 = tmp.y;
  2025. // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
  2026. return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
  2027. }
  2028. #define VDR_Q8_0_Q8_1_MMVQ 2
  2029. #define VDR_Q8_0_Q8_1_MMQ 8
  2030. template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
  2031. const int * v, const int * u, const float & d8_0, const float & d8_1) {
  2032. int sumi = 0;
  2033. #pragma unroll
  2034. for (int i = 0; i < vdr; ++i) {
  2035. // SIMD dot product of quantized values
  2036. sumi = __dp4a(v[i], u[i], sumi);
  2037. }
  2038. return d8_0*d8_1 * sumi;
  2039. }
  2040. template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
  2041. const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
  2042. int sumi = 0;
  2043. #pragma unroll
  2044. for (int i = 0; i < vdr; ++i) {
  2045. // SIMD dot product of quantized values
  2046. sumi = __dp4a(v[i], u[i], sumi);
  2047. }
  2048. const float2 tmp = __half22float2(__hmul2(dm8, ds8));
  2049. const float d8d8 = tmp.x;
  2050. const float m8s8 = tmp.y;
  2051. // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
  2052. return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
  2053. }
  2054. #define VDR_Q2_K_Q8_1_MMVQ 1
  2055. #define VDR_Q2_K_Q8_1_MMQ 2
  2056. // contiguous v/x values
  2057. static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
  2058. const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
  2059. const half2 & dm2, const float * __restrict__ d8) {
  2060. float sumf_d = 0.0f;
  2061. float sumf_m = 0.0f;
  2062. #pragma unroll
  2063. for (int i = 0; i < QR2_K; ++i) {
  2064. const int sc = scales[2*i];
  2065. const int vi = (v >> (2*i)) & 0x03030303;
  2066. sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
  2067. // fill int with 4x m
  2068. int m = sc >> 4;
  2069. m |= m << 8;
  2070. m |= m << 16;
  2071. sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
  2072. }
  2073. const float2 dm2f = __half22float2(dm2);
  2074. return dm2f.x*sumf_d - dm2f.y*sumf_m;
  2075. }
  2076. static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
  2077. const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
  2078. const half2 & dm2, const float & d8) {
  2079. int sumi_d = 0;
  2080. int sumi_m = 0;
  2081. #pragma unroll
  2082. for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
  2083. int sumi_d_sc = 0;
  2084. const int sc = scales[i0 / (QI8_1/2)];
  2085. // fill int with 4x m
  2086. int m = sc >> 4;
  2087. m |= m << 8;
  2088. m |= m << 16;
  2089. #pragma unroll
  2090. for (int i = i0; i < i0 + QI8_1/2; ++i) {
  2091. sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
  2092. sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
  2093. }
  2094. sumi_d += sumi_d_sc * (sc & 0xF);
  2095. }
  2096. const float2 dm2f = __half22float2(dm2);
  2097. return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
  2098. }
  2099. #define VDR_Q3_K_Q8_1_MMVQ 1
  2100. #define VDR_Q3_K_Q8_1_MMQ 2
  2101. // contiguous v/x values
  2102. static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
  2103. const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
  2104. const int & scale_offset, const float & d3, const float * __restrict__ d8) {
  2105. float sumf = 0.0f;
  2106. #pragma unroll
  2107. for (int i = 0; i < QR3_K; ++i) {
  2108. const int isc = scale_offset + 2*i;
  2109. const int isc_low = isc % (QK_K/32);
  2110. const int sc_shift_low = 4 * (isc / (QK_K/32));
  2111. const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
  2112. const int isc_high = isc % (QK_K/64);
  2113. const int sc_shift_high = 2 * (isc / (QK_K/64));
  2114. const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
  2115. const int sc = (sc_low | sc_high) - 32;
  2116. const int vil = (vl >> (2*i)) & 0x03030303;
  2117. const int vih = ((vh >> i) << 2) & 0x04040404;
  2118. const int vi = __vsubss4(vil, vih);
  2119. sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
  2120. }
  2121. return d3 * sumf;
  2122. }
  2123. static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
  2124. const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
  2125. const float & d3, const float & d8) {
  2126. int sumi = 0;
  2127. #pragma unroll
  2128. for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
  2129. int sumi_sc = 0;
  2130. for (int i = i0; i < i0 + QI8_1/2; ++i) {
  2131. sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
  2132. }
  2133. sumi += sumi_sc * scales[i0 / (QI8_1/2)];
  2134. }
  2135. return d3*d8 * sumi;
  2136. }
  2137. #define VDR_Q4_K_Q8_1_MMVQ 2
  2138. #define VDR_Q4_K_Q8_1_MMQ 8
  2139. // contiguous v/x values
  2140. static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
  2141. const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  2142. const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
  2143. float sumf_d = 0.0f;
  2144. float sumf_m = 0.0f;
  2145. #pragma unroll
  2146. for (int i = 0; i < QR4_K; ++i) {
  2147. const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
  2148. const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
  2149. const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
  2150. const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
  2151. sumf_d += d8[i] * (dot1 * sc[i]);
  2152. sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
  2153. }
  2154. const float2 dm4f = __half22float2(dm4);
  2155. return dm4f.x*sumf_d - dm4f.y*sumf_m;
  2156. }
  2157. static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  2158. const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  2159. const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
  2160. float sumf_d = 0.0f;
  2161. float sumf_m = 0.0f;
  2162. #pragma unroll
  2163. for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
  2164. int sumi_d = 0;
  2165. #pragma unroll
  2166. for (int j = 0; j < QI8_1; ++j) {
  2167. sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
  2168. }
  2169. const float2 ds8f = __half22float2(ds8[i]);
  2170. sumf_d += ds8f.x * (sc[i] * sumi_d);
  2171. sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
  2172. }
  2173. const float2 dm4f = __half22float2(dm4);
  2174. return dm4f.x*sumf_d - dm4f.y*sumf_m;
  2175. }
  2176. #define VDR_Q5_K_Q8_1_MMVQ 2
  2177. #define VDR_Q5_K_Q8_1_MMQ 8
  2178. static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
  2179. const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  2180. const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
  2181. float sumf_d = 0.0f;
  2182. float sumf_m = 0.0f;
  2183. #pragma unroll
  2184. for (int i = 0; i < QR5_K; ++i) {
  2185. const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
  2186. const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
  2187. const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
  2188. const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
  2189. const int v0i = vl0i | vh0i;
  2190. const int v1i = vl1i | vh1i;
  2191. const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
  2192. const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
  2193. sumf_d += d8[i] * (dot1 * sc[i]);
  2194. sumf_m += d8[i] * (dot2 * m[i]);
  2195. }
  2196. const float2 dm5f = __half22float2(dm5);
  2197. return dm5f.x*sumf_d - dm5f.y*sumf_m;
  2198. }
  2199. static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
  2200. const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  2201. const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
  2202. float sumf_d = 0.0f;
  2203. float sumf_m = 0.0f;
  2204. #pragma unroll
  2205. for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
  2206. int sumi_d = 0;
  2207. #pragma unroll
  2208. for (int j = 0; j < QI8_1; ++j) {
  2209. sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
  2210. }
  2211. const float2 ds8f = __half22float2(ds8[i]);
  2212. sumf_d += ds8f.x * (sc[i] * sumi_d);
2213. sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q5_K min val
  2214. }
  2215. const float2 dm4f = __half22float2(dm4);
  2216. return dm4f.x*sumf_d - dm4f.y*sumf_m;
  2217. }
  2218. #define VDR_Q6_K_Q8_1_MMVQ 1
  2219. #define VDR_Q6_K_Q8_1_MMQ 8
  2220. // contiguous v/x values
  2221. static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
  2222. const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
  2223. const float & d, const float * __restrict__ d8) {
  2224. float sumf = 0.0f;
  2225. #pragma unroll
  2226. for (int i = 0; i < QR6_K; ++i) {
  2227. const int sc = scales[4*i];
  2228. const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
  2229. const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
  2230. const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
  2231. sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
  2232. }
  2233. return d*sumf;
  2234. }
  2235. static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
  2236. const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
  2237. const float & d6, const float * __restrict__ d8) {
  2238. float sumf_d = 0.0f;
  2239. #pragma unroll
  2240. for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
  2241. int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
  2242. #pragma unroll
  2243. for (int i = i0; i < i0 + 2; ++i) {
  2244. sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
  2245. sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
  2246. sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
  2247. sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
  2248. }
  2249. sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
  2250. }
  2251. return d6 * sumf_d;
  2252. }
  2253. static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
  2254. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2255. const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
  2256. int v[VDR_Q4_0_Q8_1_MMVQ];
  2257. int u[2*VDR_Q4_0_Q8_1_MMVQ];
  2258. #pragma unroll
  2259. for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
  2260. v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
  2261. u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2262. u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
  2263. }
  2264. return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, __half2float(bq4_0->d), bq8_1->ds);
  2265. }
  2266. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2267. __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  2268. __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
  2269. *x_ql = tile_x_qs;
  2270. *x_dm = (half2 *) tile_x_d;
  2271. }
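// Editor's note: the tiles above are deliberately over-allocated. x_ql is
// indexed as x_ql[i*(WARP_SIZE + 1) + k], so each tile row is WARP_SIZE+1 ints
// wide; the extra "+ mmq_y" ints supply that one-element-per-row padding, which
// staggers rows across shared-memory banks (the usual trick to avoid bank
// conflicts on column-wise accesses). The scale tile gets one extra entry per
// QI4_0 rows for the same reason -- note the "+ i/QI4_0" in its index below.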
  2272. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
  2273. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2274. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2275. const int kbx = k / QI4_0;
  2276. const int kqsx = k % QI4_0;
  2277. const block_q4_0 * bx0 = (const block_q4_0 *) vx;
  2278. float * x_dmf = (float *) x_dm;
  2279. #pragma unroll
  2280. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2281. int i = i0 + i_offset;
  2282. if (need_check) {
  2283. i = min(i, i_max);
  2284. }
  2285. const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
  2286. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
  2287. // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
  2288. }
  2289. const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
  2290. const int kbxd = k % blocks_per_tile_x_row;
  2291. #pragma unroll
  2292. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
  2293. int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
  2294. if (need_check) {
  2295. i = min(i, i_max);
  2296. }
  2297. const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
  2298. x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = __half2float(bxi->d);
  2299. }
  2300. }
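// Editor's note: load_tiles_q4_0 fills the tile in two passes: the first loop
// stores one int of packed quants per (row, k) into x_ql; the second loop,
// striding rows by nwarps*QI4_0, stores one fp32 scale per q4_0 block into the
// x_dm storage reinterpreted as float. When need_check is set, the row index is
// clamped to i_max, so partially filled tiles re-read the last valid row instead
// of reading past the end of the matrix.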
  2301. static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
  2302. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2303. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2304. (void)x_qh; (void)x_sc;
  2305. const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  2306. const float * x_dmf = (const float *) x_dm;
  2307. int u[2*VDR_Q4_0_Q8_1_MMQ];
  2308. #pragma unroll
  2309. for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
  2310. u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
  2311. u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
  2312. }
  2313. return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
  2314. (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
  2315. y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  2316. }
  2317. static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
  2318. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2319. const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
  2320. int v[VDR_Q4_1_Q8_1_MMVQ];
  2321. int u[2*VDR_Q4_1_Q8_1_MMVQ];
  2322. #pragma unroll
  2323. for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
  2324. v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
  2325. u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2326. u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
  2327. }
  2328. return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
  2329. }
  2330. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2331. __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  2332. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
  2333. *x_ql = tile_x_qs;
  2334. *x_dm = tile_x_dm;
  2335. }
  2336. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
  2337. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2338. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2339. const int kbx = k / QI4_1;
  2340. const int kqsx = k % QI4_1;
  2341. const block_q4_1 * bx0 = (const block_q4_1 *) vx;
  2342. #pragma unroll
  2343. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2344. int i = i0 + i_offset;
  2345. if (need_check) {
  2346. i = min(i, i_max);
  2347. }
  2348. const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
  2349. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2350. }
  2351. const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
  2352. const int kbxd = k % blocks_per_tile_x_row;
  2353. #pragma unroll
  2354. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
  2355. int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
  2356. if (need_check) {
  2357. i = min(i, i_max);
  2358. }
  2359. const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
  2360. x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
  2361. }
  2362. }
  2363. static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
  2364. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2365. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2366. const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  2367. int u[2*VDR_Q4_1_Q8_1_MMQ];
  2368. #pragma unroll
  2369. for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
  2370. u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
  2371. u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
  2372. }
  2373. return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
  2374. (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
  2375. y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  2376. }
  2377. static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
  2378. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2379. const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
  2380. int vl[VDR_Q5_0_Q8_1_MMVQ];
  2381. int vh[VDR_Q5_0_Q8_1_MMVQ];
  2382. int u[2*VDR_Q5_0_Q8_1_MMVQ];
  2383. #pragma unroll
  2384. for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
  2385. vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
  2386. vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
  2387. u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2388. u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
  2389. }
  2390. return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, __half2float(bq5_0->d), bq8_1->ds);
  2391. }
  2392. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2393. __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  2394. __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
  2395. *x_ql = tile_x_ql;
  2396. *x_dm = (half2 *) tile_x_d;
  2397. }
  2398. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
  2399. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2400. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2401. const int kbx = k / QI5_0;
  2402. const int kqsx = k % QI5_0;
  2403. const block_q5_0 * bx0 = (const block_q5_0 *) vx;
  2404. #pragma unroll
  2405. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2406. int i = i0 + i_offset;
  2407. if (need_check) {
  2408. i = min(i, i_max);
  2409. }
  2410. const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
  2411. const int ql = get_int_from_uint8(bxi->qs, kqsx);
  2412. const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
  2413. int qs0 = (ql >> 0) & 0x0F0F0F0F;
  2414. qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
  2415. qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
  2416. qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
  2417. qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
  2418. qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
  2419. x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
  2420. int qs1 = (ql >> 4) & 0x0F0F0F0F;
  2421. qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
  2422. qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
  2423. qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
  2424. qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
  2425. qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
  2426. x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
  2427. }
  2428. const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
  2429. const int kbxd = k % blocks_per_tile_x_row;
  2430. float * x_dmf = (float *) x_dm;
  2431. #pragma unroll
  2432. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
  2433. int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
  2434. if (need_check) {
  2435. i = min(i, i_max);
  2436. }
  2437. const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
  2438. x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = __half2float(bxi->d);
  2439. }
  2440. }
  2441. static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
  2442. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2443. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2444. const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  2445. const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
  2446. const float * x_dmf = (const float *) x_dm;
  2447. const float * y_df = (const float *) y_ds;
  2448. int u[2*VDR_Q5_0_Q8_1_MMQ];
  2449. #pragma unroll
  2450. for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
  2451. u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
  2452. u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
  2453. }
  2454. return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
  2455. (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  2456. }
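// Editor's note: for q5_0 the expensive bit gathering is done once at tile-load
// time. load_tiles_q5_0 merges the fifth bits into the nibbles and subtracts 16
// (__vsubss4 against 0x10101010), so the tile already holds plain signed 8-bit
// values; that is why the MMQ dot product above can reuse vec_dot_q8_0_q8_1_impl
// with QR5_0*VDR_Q5_0_Q8_1_MMQ ints instead of needing a dedicated q5_0 kernel.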
  2457. static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
  2458. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2459. const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
  2460. int vl[VDR_Q5_1_Q8_1_MMVQ];
  2461. int vh[VDR_Q5_1_Q8_1_MMVQ];
  2462. int u[2*VDR_Q5_1_Q8_1_MMVQ];
  2463. #pragma unroll
  2464. for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
  2465. vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
  2466. vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
  2467. u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2468. u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
  2469. }
  2470. return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
  2471. }
  2472. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2473. __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  2474. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
  2475. *x_ql = tile_x_ql;
  2476. *x_dm = tile_x_dm;
  2477. }
  2478. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
  2479. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2480. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2481. const int kbx = k / QI5_1;
  2482. const int kqsx = k % QI5_1;
  2483. const block_q5_1 * bx0 = (const block_q5_1 *) vx;
  2484. #pragma unroll
  2485. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2486. int i = i0 + i_offset;
  2487. if (need_check) {
  2488. i = min(i, i_max);
  2489. }
  2490. const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
  2491. const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2492. const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
  2493. int qs0 = (ql >> 0) & 0x0F0F0F0F;
  2494. qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
  2495. qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
  2496. qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
  2497. qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
  2498. x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
  2499. int qs1 = (ql >> 4) & 0x0F0F0F0F;
  2500. qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
  2501. qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
  2502. qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
  2503. qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
  2504. x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
  2505. }
  2506. const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
  2507. const int kbxd = k % blocks_per_tile_x_row;
  2508. #pragma unroll
  2509. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
  2510. int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
  2511. if (need_check) {
  2512. i = min(i, i_max);
  2513. }
  2514. const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
  2515. x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
  2516. }
  2517. }
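// Illustrative note (added): after the repacking above, each 32-bit entry of x_ql holds four
// 5-bit quants, one per byte. For the low-nibble word qs0, byte b keeps its 4-bit value in
// bits [8*b, 8*b+3] and receives its fifth bit from qh at bit 8*b+4, i.e. at bit positions
// 4, 12, 20 and 28 -- exactly the mapping spelled out by the "0 -> 4" ... "3 -> 28" comments;
// qs1 does the same for the high nibbles using qh bits 16..19.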
  2518. static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
  2519. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2520. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2521. const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2522. const int index_bx = i * (WARP_SIZE/QI5_1) + i/QI5_1 + k/QI5_1;
  2523. int u[2*VDR_Q5_1_Q8_1_MMQ];
  2524. #pragma unroll
  2525. for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
  2526. u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
  2527. u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
  2528. }
  2529. return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
  2530. (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  2531. }
  2532. static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  2533. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2534. const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
  2535. int v[VDR_Q8_0_Q8_1_MMVQ];
  2536. int u[VDR_Q8_0_Q8_1_MMVQ];
  2537. #pragma unroll
  2538. for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
  2539. v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
  2540. u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  2541. }
  2542. return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, __half2float(bq8_0->d), __low2float(bq8_1->ds));
  2543. }
  2544. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2545. __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
  2546. __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
  2547. *x_ql = tile_x_qs;
  2548. *x_dm = (half2 *) tile_x_d;
  2549. }
  2550. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
  2551. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2552. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2553. const int kbx = k / QI8_0;
  2554. const int kqsx = k % QI8_0;
  2555. float * x_dmf = (float *) x_dm;
  2556. const block_q8_0 * bx0 = (const block_q8_0 *) vx;
  2557. #pragma unroll
  2558. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2559. int i = i0 + i_offset;
  2560. if (need_check) {
  2561. i = min(i, i_max);
  2562. }
  2563. const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
  2564. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
  2565. }
  2566. const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
  2567. const int kbxd = k % blocks_per_tile_x_row;
  2568. #pragma unroll
  2569. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
  2570. int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
  2571. if (need_check) {
  2572. i = min(i, i_max);
  2573. }
  2574. const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
  2575. x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = __half2float(bxi->d);
  2576. }
  2577. }
  2578. static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
  2579. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2580. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2581. const float * x_dmf = (const float *) x_dm;
  2582. const float * y_df = (const float *) y_ds;
  2583. return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
  2584. (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
  2585. y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
  2586. }
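// Note: q8_0 needs no bit repacking, so its x tile uses a row stride of WARP_SIZE + 1 ints
// (vs. 2*WARP_SIZE + 1 for the 5-bit formats above) and the block scales are stored as plain
// floats through the x_dmf alias of the half2 tile.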
  2587. static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  2588. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2589. const block_q2_K * bq2_K = (const block_q2_K *) vbq;
  2590. const int bq8_offset = QR2_K * (iqs / QI8_1);
  2591. const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
  2592. const uint8_t * scales = bq2_K->scales + scale_offset;
  2593. const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
  2594. int u[QR2_K];
  2595. float d8[QR2_K];
  2596. #pragma unroll
  2597. for (int i = 0; i < QR2_K; ++ i) {
  2598. u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
  2599. d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
  2600. }
  2601. return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
  2602. }
  2603. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2604. __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  2605. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
  2606. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
  2607. *x_ql = tile_x_ql;
  2608. *x_dm = tile_x_dm;
  2609. *x_sc = tile_x_sc;
  2610. }
  2611. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
  2612. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2613. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2614. const int kbx = k / QI2_K;
  2615. const int kqsx = k % QI2_K;
  2616. const block_q2_K * bx0 = (const block_q2_K *) vx;
  2617. #pragma unroll
  2618. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2619. int i = i0 + i_offset;
  2620. if (need_check) {
  2621. i = min(i, i_max);
  2622. }
  2623. const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
  2624. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2625. }
  2626. const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
  2627. const int kbxd = k % blocks_per_tile_x_row;
  2628. #pragma unroll
  2629. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
  2630. int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
  2631. if (need_check) {
  2632. i = min(i, i_max);
  2633. }
  2634. const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
  2635. x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
  2636. }
  2637. #pragma unroll
  2638. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
  2639. int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
  2640. if (need_check) {
  2641. i = min(i, i_max);
  2642. }
  2643. const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
  2644. x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
  2645. }
  2646. }
  2647. static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
  2648. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2649. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2650. const int kbx = k / QI2_K;
  2651. const int ky = (k % QI2_K) * QR2_K;
  2652. const float * y_df = (const float *) y_ds;
  2653. int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
  2654. const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
  2655. const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
  2656. #pragma unroll
  2657. for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
  2658. v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
  2659. }
  2660. const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
  2661. const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
  2662. return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
  2663. }
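// Note: the q2_K tile keeps the raw 2-bit quants packed four per byte; `shift` selects which
// of the four 2-bit planes is used and the & 0x03030303 mask extracts one 2-bit value per
// byte before vec_dot_q2_K_q8_1_impl_mmq applies the per-sub-block scales and mins.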
  2664. static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
  2665. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2666. const block_q3_K * bq3_K = (const block_q3_K *) vbq;
  2667. const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
  2668. const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
  2669. const float d = __half2float(bq3_K->d);
  2670. const int vl = get_int_from_uint8(bq3_K->qs, iqs);
  2671. // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
  2672. const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
  2673. int u[QR3_K];
  2674. float d8[QR3_K];
  2675. #pragma unroll
  2676. for (int i = 0; i < QR3_K; ++i) {
  2677. u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
  2678. d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
  2679. }
  2680. return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
  2681. }
  2682. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2683. __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  2684. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
  2685. __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
  2686. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
  2687. *x_ql = tile_x_ql;
  2688. *x_dm = tile_x_dm;
  2689. *x_qh = tile_x_qh;
  2690. *x_sc = tile_x_sc;
  2691. }
  2692. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
  2693. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2694. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2695. const int kbx = k / QI3_K;
  2696. const int kqsx = k % QI3_K;
  2697. const block_q3_K * bx0 = (const block_q3_K *) vx;
  2698. #pragma unroll
  2699. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2700. int i = i0 + i_offset;
  2701. if (need_check) {
  2702. i = min(i, i_max);
  2703. }
  2704. const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
  2705. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
  2706. }
  2707. const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
  2708. const int kbxd = k % blocks_per_tile_x_row;
  2709. float * x_dmf = (float *) x_dm;
  2710. #pragma unroll
  2711. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
  2712. int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
  2713. if (need_check) {
  2714. i = min(i, i_max);
  2715. }
  2716. const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
  2717. x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = __half2float(bxi->d);
  2718. }
  2719. #pragma unroll
  2720. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
  2721. int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
  2722. if (need_check) {
  2723. i = min(i, i_max);
  2724. }
  2725. const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
  2726. // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
  2727. x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
  2728. }
  2729. #pragma unroll
  2730. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
  2731. int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
  2732. if (need_check) {
  2733. i = min(i, i_max);
  2734. }
  2735. const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
  2736. const int ksc = k % (QI3_K/4);
  2737. const int ksc_low = ksc % (QI3_K/8);
  2738. const int shift_low = 4 * (ksc / (QI3_K/8));
  2739. const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
  2740. const int ksc_high = QI3_K/8;
  2741. const int shift_high = 2 * ksc;
  2742. const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
  2743. const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
  2744. x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
  2745. }
  2746. }
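// Note: the last loop above rebuilds the 6-bit q3_K scales from their packed layout (low
// 4 bits from the first 8 scale bytes, high 2 bits from the last 4) and re-centers them with
// __vsubss4(..., 0x20202020), i.e. subtracts 32 from every byte, so the MMQ dot product can
// treat them as signed int8 scales.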
  2747. static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
  2748. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2749. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2750. const int kbx = k / QI3_K;
  2751. const int ky = (k % QI3_K) * QR3_K;
  2752. const float * x_dmf = (const float *) x_dm;
  2753. const float * y_df = (const float *) y_ds;
  2754. const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
  2755. int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
  2756. #pragma unroll
  2757. for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
  2758. const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
  2759. const int shift = 2 * ((ky % 32) / 8);
  2760. const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
  2761. const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
  2762. const int vlh = (vh << 2) & 0x04040404;
  2763. v[l] = __vsubss4(vll, vlh);
  2764. }
  2765. const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
  2766. return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
  2767. }
  2768. static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  2769. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2770. const block_q4_K * bq4_K = (const block_q4_K *) vbq;
  2771. int v[2];
  2772. int u[2*QR4_K];
  2773. float d8[QR4_K];
2774. // iqs is in 0,2..30. bq8_offset = QR4_K*(iqs/8) -> bq8_offset = 0, 2, 4, 6
2775. const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
2776. // iqs/2 = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
2777. // iqs/2 = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
2778. // iqs/2 = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
2779. // iqs/2 = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
  2780. const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
  2781. v[0] = q4[0];
  2782. v[1] = q4[4];
  2783. const uint16_t * scales = (const uint16_t *)bq4_K->scales;
  2784. uint16_t aux[2];
  2785. const int j = bq8_offset/2;
  2786. if (j < 2) {
  2787. aux[0] = scales[j+0] & 0x3f3f;
  2788. aux[1] = scales[j+2] & 0x3f3f;
  2789. } else {
  2790. aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
  2791. aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
  2792. }
  2793. const uint8_t * sc = (const uint8_t *)aux;
  2794. const uint8_t * m = sc + 2;
  2795. for (int i = 0; i < QR4_K; ++i) {
  2796. const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
  2797. d8[i] = __low2float(bq8i->ds);
  2798. const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  2799. u[2*i+0] = q8[0];
  2800. u[2*i+1] = q8[4];
  2801. }
  2802. return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
  2803. }
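// Note: q4_K packs eight 6-bit scales and eight 6-bit mins into a 12-byte array; the aux[]
// decode above extracts the two scales (sc) and two mins (m) belonging to the selected pair
// of 32-value sub-blocks, with the else-branch handling the split encoding used for
// sub-blocks 4..7.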
  2804. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2805. __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
  2806. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
  2807. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
  2808. *x_ql = tile_x_ql;
  2809. *x_dm = tile_x_dm;
  2810. *x_sc = tile_x_sc;
  2811. }
  2812. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
  2813. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2814. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2815. const int kbx = k / QI4_K; // == 0 if QK_K == 256
  2816. const int kqsx = k % QI4_K; // == k if QK_K == 256
  2817. const block_q4_K * bx0 = (const block_q4_K *) vx;
  2818. #pragma unroll
  2819. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2820. int i = i0 + i_offset;
  2821. if (need_check) {
  2822. i = min(i, i_max);
  2823. }
  2824. const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
  2825. x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2826. }
  2827. const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
  2828. const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
  2829. #pragma unroll
  2830. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
  2831. int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
  2832. if (need_check) {
  2833. i = min(i, i_max);
  2834. }
  2835. const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
  2836. x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
  2837. }
  2838. #pragma unroll
  2839. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
  2840. int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
  2841. if (need_check) {
  2842. i = min(i, i_max);
  2843. }
  2844. const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
  2845. const int * scales = (const int *) bxi->scales;
  2846. const int ksc = k % (WARP_SIZE/8);
2847. // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
  2848. int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
  2849. scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
  2850. x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
  2851. }
  2852. }
  2853. static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  2854. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2855. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2856. (void)x_qh;
  2857. const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
  2858. const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
  2859. return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
  2860. x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
  2861. }
  2862. static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  2863. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2864. const block_q5_K * bq5_K = (const block_q5_K *) vbq;
  2865. int vl[2];
  2866. int vh[2];
  2867. int u[2*QR5_K];
  2868. float d8[QR5_K];
  2869. const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
  2870. const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
  2871. const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
  2872. vl[0] = ql[0];
  2873. vl[1] = ql[4];
  2874. vh[0] = qh[0] >> bq8_offset;
  2875. vh[1] = qh[4] >> bq8_offset;
  2876. const uint16_t * scales = (const uint16_t *)bq5_K->scales;
  2877. uint16_t aux[2];
  2878. const int j = bq8_offset/2;
  2879. if (j < 2) {
  2880. aux[0] = scales[j+0] & 0x3f3f;
  2881. aux[1] = scales[j+2] & 0x3f3f;
  2882. } else {
  2883. aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
  2884. aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
  2885. }
  2886. const uint8_t * sc = (const uint8_t *)aux;
  2887. const uint8_t * m = sc + 2;
  2888. #pragma unroll
  2889. for (int i = 0; i < QR5_K; ++i) {
  2890. const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
  2891. d8[i] = __low2float(bq8i->ds);
  2892. const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  2893. u[2*i+0] = q8[0];
  2894. u[2*i+1] = q8[4];
  2895. }
  2896. return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
  2897. }
  2898. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2899. __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  2900. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
  2901. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
  2902. *x_ql = tile_x_ql;
  2903. *x_dm = tile_x_dm;
  2904. *x_sc = tile_x_sc;
  2905. }
  2906. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
  2907. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2908. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2909. const int kbx = k / QI5_K; // == 0 if QK_K == 256
  2910. const int kqsx = k % QI5_K; // == k if QK_K == 256
  2911. const block_q5_K * bx0 = (const block_q5_K *) vx;
  2912. #pragma unroll
  2913. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  2914. int i = i0 + i_offset;
  2915. if (need_check) {
  2916. i = min(i, i_max);
  2917. }
  2918. const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
  2919. const int ky = QR5_K*kqsx;
  2920. const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
  2921. const int ql0 = (ql >> 0) & 0x0F0F0F0F;
  2922. const int ql1 = (ql >> 4) & 0x0F0F0F0F;
  2923. const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
  2924. const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
  2925. const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
  2926. const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
  2927. const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
  2928. x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
  2929. x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
  2930. }
  2931. const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
  2932. const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
  2933. #pragma unroll
  2934. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
  2935. int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
  2936. if (need_check) {
  2937. i = min(i, i_max);
  2938. }
  2939. const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
  2940. x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
  2941. }
  2942. #pragma unroll
  2943. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
  2944. int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
  2945. if (need_check) {
  2946. i = min(i, i_max);
  2947. }
  2948. const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
  2949. const int * scales = (const int *) bxi->scales;
  2950. const int ksc = k % (WARP_SIZE/8);
2951. // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
  2952. int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
  2953. scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
  2954. x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
  2955. }
  2956. }
  2957. static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
  2958. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  2959. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  2960. const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
  2961. const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
  2962. const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
  2963. return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
  2964. x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
  2965. }
  2966. static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  2967. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  2968. const block_q6_K * bq6_K = (const block_q6_K *) vbq;
  2969. const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
  2970. const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
  2971. const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
  2972. const int vl = get_int_from_uint8(bq6_K->ql, iqs);
  2973. const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
  2974. const int8_t * scales = bq6_K->scales + scale_offset;
  2975. int u[QR6_K];
  2976. float d8[QR6_K];
  2977. #pragma unroll
  2978. for (int i = 0; i < QR6_K; ++i) {
  2979. u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
  2980. d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
  2981. }
  2982. return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, __half2float(bq6_K->d), d8);
  2983. }
  2984. template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
  2985. __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
  2986. __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
  2987. __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
  2988. *x_ql = tile_x_ql;
  2989. *x_dm = tile_x_dm;
  2990. *x_sc = tile_x_sc;
  2991. }
  2992. template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
  2993. const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  2994. int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
  2995. const int kbx = k / QI6_K; // == 0 if QK_K == 256
  2996. const int kqsx = k % QI6_K; // == k if QK_K == 256
  2997. const block_q6_K * bx0 = (const block_q6_K *) vx;
  2998. #pragma unroll
  2999. for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  3000. int i = i0 + i_offset;
  3001. if (need_check) {
  3002. i = min(i, i_max);
  3003. }
  3004. const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
  3005. const int ky = QR6_K*kqsx;
  3006. const int ql = get_int_from_uint8(bxi->ql, kqsx);
  3007. const int ql0 = (ql >> 0) & 0x0F0F0F0F;
  3008. const int ql1 = (ql >> 4) & 0x0F0F0F0F;
  3009. const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
  3010. const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
  3011. const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
  3012. const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
  3013. const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
  3014. x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
  3015. x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
  3016. }
  3017. const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
  3018. const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
  3019. float * x_dmf = (float *) x_dm;
  3020. #pragma unroll
  3021. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
  3022. int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
  3023. if (need_check) {
  3024. i = min(i, i_max);
  3025. }
  3026. const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
  3027. x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = __half2float(bxi->d);
  3028. }
  3029. #pragma unroll
  3030. for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
  3031. int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
  3032. if (need_check) {
  3033. i = min(i, i_max);
  3034. }
  3035. const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
  3036. x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
  3037. }
  3038. }
  3039. static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
  3040. const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  3041. const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
  3042. const float * x_dmf = (const float *) x_dm;
  3043. const float * y_df = (const float *) y_ds;
  3044. const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
  3045. const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
  3046. const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
  3047. return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
  3048. }
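// Note: load_tiles_q6_K already combines ql and qh into 6-bit values and subtracts the 32
// offset via __vsubss4(..., 0x20202020), so vec_dot_q6_K_q8_1_impl_mmq receives plain signed
// int8 quants and only the float block scale (x_dmf) and the int8 sub-block scales remain to
// be applied here.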
  3049. static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
  3050. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3051. const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
  3052. const int ib32 = iqs;
  3053. const uint16_t * q2 = bq2->qs + 4*ib32;
  3054. const uint8_t * aux8 = (const uint8_t *)q2;
  3055. const int8_t * q8 = bq8_1[ib32].qs;
  3056. uint32_t aux32 = q2[2] | (q2[3] << 16);
  3057. int sumi = 0;
  3058. for (int l = 0; l < 4; ++l) {
  3059. const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
  3060. const uint8_t signs = ksigns_iq2xs[aux32 & 127];
  3061. for (int j = 0; j < 8; ++j) {
  3062. sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  3063. }
  3064. q8 += 8;
  3065. aux32 >>= 7;
  3066. }
3067. const float d = __half2float(bq2->d) * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
  3068. return d * sumi;
  3069. }
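// Note: in iq2_xxs each group of 8 weights is a codebook entry: aux8[l] indexes iq2xxs_grid
// for the magnitudes, the low 7 bits of aux32 index ksigns_iq2xs for the signs, and after the
// four 7-bit shifts the remaining top bits of aux32 are the 4-bit sub-block scale that feeds
// the final (0.5f + aux32) * 0.25f factor.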
  3070. static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
  3071. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3072. const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
  3073. const int ib32 = iqs;
  3074. const uint16_t * q2 = bq2->qs + 4*ib32;
  3075. const int8_t * q8 = bq8_1[ib32].qs;
  3076. const uint8_t ls1 = bq2->scales[ib32] & 0xf;
  3077. const uint8_t ls2 = bq2->scales[ib32] >> 4;
  3078. int sumi1 = 0;
  3079. for (int l = 0; l < 2; ++l) {
  3080. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
  3081. const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
  3082. for (int j = 0; j < 8; ++j) {
  3083. sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  3084. }
  3085. q8 += 8;
  3086. }
  3087. int sumi2 = 0;
  3088. for (int l = 2; l < 4; ++l) {
  3089. const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
  3090. const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
  3091. for (int j = 0; j < 8; ++j) {
  3092. sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
  3093. }
  3094. q8 += 8;
  3095. }
3096. const float d = __half2float(bq2->d) * __low2float(bq8_1[ib32].ds) * 0.25f;
  3097. return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
  3098. }
  3099. static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
  3100. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3101. const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
  3102. const int ib32 = iqs;
  3103. const int8_t * q8 = bq8_1[ib32].qs;
  3104. const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
  3105. const uint8_t ls1 = bq2->scales[ib32] & 0xf;
  3106. const uint8_t ls2 = bq2->scales[ib32] >> 4;
  3107. int sumi1 = 0;
  3108. for (int l = 0; l < 2; ++l) {
  3109. const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
  3110. const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
  3111. const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
  3112. const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
  3113. const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
  3114. sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
  3115. sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
  3116. q8 += 8;
  3117. }
  3118. int sumi2 = 0;
  3119. for (int l = 2; l < 4; ++l) {
  3120. const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
  3121. const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
  3122. const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
  3123. const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
  3124. const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
  3125. sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
  3126. sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
  3127. q8 += 8;
  3128. }
  3129. const float d = __half2float(bq2->d) * __low2float(bq8_1[ib32].ds) * 0.25f;
  3130. return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
  3131. }
  3132. static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
  3133. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3134. const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
  3135. const int ib32 = iqs;
  3136. const uint8_t * q3 = bq2->qs + 8*ib32;
  3137. const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
  3138. const int8_t * q8 = bq8_1[ib32].qs;
  3139. uint32_t aux32 = gas[0] | (gas[1] << 16);
  3140. int sumi = 0;
  3141. for (int l = 0; l < 4; ++l) {
  3142. const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
  3143. const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
  3144. const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
  3145. const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
  3146. const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
  3147. sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
  3148. sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
  3149. q8 += 8;
  3150. aux32 >>= 7;
  3151. }
  3152. const float d = __half2float(bq2->d) * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
  3153. return d * sumi;
  3154. }
  3155. static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
  3156. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3157. const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
  3158. const int ib32 = iqs;
  3159. const uint8_t * qs = bq2->qs + 8*ib32;
  3160. const int8_t * q8 = bq8_1[ib32].qs;
  3161. int sumi = 0;
  3162. for (int l = 0; l < 4; ++l) {
  3163. const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
  3164. const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
  3165. uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
  3166. uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
  3167. const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
  3168. const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
  3169. sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
  3170. sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
  3171. q8 += 8;
  3172. }
  3173. const float d = __half2float(bq2->d) * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f;
  3174. return d * sumi;
  3175. }
  3176. static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
  3177. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3178. const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
  3179. const int ib32 = iqs;
  3180. int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
  3181. const uint8_t h1 = bq1->scales[2*ib32+0];
  3182. const uint8_t h2 = bq1->scales[2*ib32+1];
  3183. const int * q8 = (const int *)bq8_1[ib32].qs;
  3184. const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
  3185. const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
  3186. const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
  3187. const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
  3188. for (int j = 0; j < 2; ++j) {
  3189. sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
  3190. sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
  3191. sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
  3192. sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
  3193. }
  3194. const float d = __half2float(bq1->d) * __low2float(bq8_1[ib32].ds);
  3195. return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
  3196. sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
  3197. }
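// Note: for iq1_s each group of 8 weights comes from iq1s_grid, addressed by 8 bits from qs
// plus a ninth bit taken from the matching scale byte; the per-group scale is (2*s + 1) with
// s the 3-bit field of that same byte, which is exactly the weighted sum evaluated above.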
  3198. static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
  3199. int & val1, int & val2) {
  3200. uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
  3201. aux32 = q4 & 0x0f0f0f0f;
  3202. uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
  3203. uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
  3204. val1 = v1 | (v2 << 16);
  3205. aux32 = (q4 >> 4) & 0x0f0f0f0f;
  3206. v1 = values[q8[0]] | (values[q8[1]] << 8);
  3207. v2 = values[q8[2]] | (values[q8[3]] << 8);
  3208. val2 = v1 | (v2 << 16);
  3209. }
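// Illustrative example (hypothetical input, added for clarity): for q4 = 0x00000021 the
// per-byte low nibbles are {1, 0, 0, 0} and the high nibbles are {2, 0, 0, 0}, so val1 packs
// values[1], values[0], values[0], values[0] and val2 packs values[2], values[0], values[0],
// values[0] as four int8 lanes; the iq4_nl / iq4_xs dot products below feed these packed
// bytes straight into __dp4a.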
  3210. static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
  3211. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3212. const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
  3213. const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
  3214. const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
  3215. const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
  3216. int v1, v2;
  3217. int sumi1 = 0, sumi2 = 0;
  3218. for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
  3219. const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
  3220. get_int_from_table_16(aux, values, v1, v2);
  3221. sumi1 = __dp4a(v1, q8[l+0], sumi1);
  3222. sumi2 = __dp4a(v2, q8[l+4], sumi2);
  3223. }
  3224. const float d = __half2float(bq->d) * __low2float(bq8_1->ds);
  3225. return d * (sumi1 + sumi2);
  3226. }
  3227. static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
  3228. const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  3229. const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
  3230. const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
  3231. // iqs is 0...7
  3232. const int ib32 = iqs;
3233. const int32_t * q8 = (const int32_t *)bq8_1[ib32].qs;
  3234. const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
  3235. const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
  3236. const float d = __half2float(bq4->d) * (ls - 32) * __low2float(bq8_1[ib32].ds);
  3237. int v1, v2;
  3238. int sumi1 = 0, sumi2 = 0;
  3239. for (int j = 0; j < 4; ++j) {
  3240. get_int_from_table_16(q4[j], values, v1, v2);
  3241. sumi1 = __dp4a(v1, q8[j+0], sumi1);
  3242. sumi2 = __dp4a(v2, q8[j+4], sumi2);
  3243. }
  3244. return d * (sumi1 + sumi2);
  3245. }
  3246. template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
  3247. static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, const int ncols, const int nrows) {
  3248. const int row = blockIdx.x*blockDim.y + threadIdx.y;
  3249. if (row >= nrows) {
  3250. return;
  3251. }
  3252. const int blocks_per_row = ncols / qk;
  3253. const int blocks_per_warp = vdr * WARP_SIZE / qi;
  3254. // partial sum for each thread
  3255. float tmp = 0.0f;
  3256. const block_q_t * x = (const block_q_t *) vx;
  3257. const block_q8_1 * y = (const block_q8_1 *) vy;
  3258. for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
  3259. const int ibx = row*blocks_per_row + i; // x block index
  3260. const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
  3261. const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
  3262. tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
  3263. }
  3264. // sum up partial sums and write back result
  3265. #pragma unroll
  3266. for (int mask = 16; mask > 0; mask >>= 1) {
  3267. tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  3268. }
  3269. if (threadIdx.x == 0) {
  3270. dst[row] = __float2half(tmp);
  3271. }
  3272. }
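// Note: mul_mat_vec_q is launched with blockDim = (WARP_SIZE, GGML_CUDA_MMV_Y); each warp
// (one threadIdx.y value) owns one output row, its 32 lanes stride over the quantized blocks
// of that row, and the partial sums are combined with a butterfly __shfl_xor_sync reduction
// before lane 0 writes the half-precision result.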
  3273. static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3274. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3275. const dim3 block_nums(block_num_y, 1, 1);
  3276. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3277. mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
  3278. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3279. }
  3280. static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3281. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3282. const dim3 block_nums(block_num_y, 1, 1);
  3283. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3284. mul_mat_vec_q<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
  3285. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3286. }
  3287. static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3288. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3289. const dim3 block_nums(block_num_y, 1, 1);
  3290. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3291. mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
  3292. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3293. }
  3294. static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3295. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3296. const dim3 block_nums(block_num_y, 1, 1);
  3297. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3298. mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
  3299. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3300. }
  3301. static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3302. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3303. const dim3 block_nums(block_num_y, 1, 1);
  3304. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3305. mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
  3306. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3307. }
  3308. static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3309. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3310. const dim3 block_nums(block_num_y, 1, 1);
  3311. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3312. mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
  3313. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3314. }
  3315. static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3316. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3317. const dim3 block_nums(block_num_y, 1, 1);
  3318. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3319. mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
  3320. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3321. }
  3322. static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3323. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3324. const dim3 block_nums(block_num_y, 1, 1);
  3325. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3326. mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
  3327. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3328. }
  3329. static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3330. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3331. const dim3 block_nums(block_num_y, 1, 1);
  3332. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3333. mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
  3334. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3335. }
  3336. static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3337. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3338. const dim3 block_nums(block_num_y, 1, 1);
  3339. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3340. mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
  3341. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3342. }
  3343. static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3344. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3345. const dim3 block_nums(block_num_y, 1, 1);
  3346. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3347. mul_mat_vec_q<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
  3348. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3349. }
  3350. static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3351. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3352. const dim3 block_nums(block_num_y, 1, 1);
  3353. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3354. mul_mat_vec_q<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
  3355. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3356. }
  3357. static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3358. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3359. const dim3 block_nums(block_num_y, 1, 1);
  3360. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3361. mul_mat_vec_q<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
  3362. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3363. }
  3364. static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3365. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3366. const dim3 block_nums(block_num_y, 1, 1);
  3367. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3368. mul_mat_vec_q<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
  3369. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3370. }
  3371. static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3372. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3373. const dim3 block_nums(block_num_y, 1, 1);
  3374. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3375. mul_mat_vec_q<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
  3376. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3377. }
  3378. static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3379. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3380. const dim3 block_nums(block_num_y, 1, 1);
  3381. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3382. mul_mat_vec_q<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
  3383. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3384. }
  3385. static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3386. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3387. const dim3 block_nums(block_num_y, 1, 1);
  3388. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3389. mul_mat_vec_q<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
  3390. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3391. }
  3392. static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
  3393. const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  3394. const dim3 block_nums(block_num_y, 1, 1);
  3395. const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
  3396. mul_mat_vec_q<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
  3397. <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  3398. }
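// Illustrative host-side usage sketch (not part of the original kernels): a minimal wrapper
// around one of the launchers above, assuming d_weights holds row-major q4_0 blocks,
// d_act_q8_1 the q8_1-quantized activation vector, and d_out space for nrows half values.
// ncols must be a multiple of the q4_0 block size (QK4_0).
static void example_launch_mmvq_q4_0(const void * d_weights, const void * d_act_q8_1,
                                     half * d_out, int ncols, int nrows, cudaStream_t stream) {
    // One warp per output row, GGML_CUDA_MMV_Y rows per thread block (see the launcher above).
    mul_mat_vec_q4_0_q8_1_cuda(d_weights, d_act_q8_1, d_out, ncols, nrows, stream);
}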
  3399. template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
  3400. allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
  3401. static __device__ __forceinline__ void mul_mat_q(
  3402. const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
  3403. const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
  3404. const block_q_t * x = (const block_q_t *) vx;
  3405. const block_q8_1 * y = (const block_q8_1 *) vy;
  3406. const int blocks_per_row_x = ncols_x / qk;
  3407. const int blocks_per_col_y = nrows_y / QK8_1;
  3408. const int blocks_per_warp = WARP_SIZE / qi;
  3409. const int & ncols_dst = ncols_y;
  3410. const int row_dst_0 = blockIdx.x*mmq_y;
  3411. const int & row_x_0 = row_dst_0;
  3412. const int col_dst_0 = blockIdx.y*mmq_x;
  3413. const int & col_y_0 = col_dst_0;
  3414. int * tile_x_ql = nullptr;
  3415. half2 * tile_x_dm = nullptr;
  3416. int * tile_x_qh = nullptr;
  3417. int * tile_x_sc = nullptr;
  3418. allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
  3419. __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
  3420. __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
  3421. float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
  3422. for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
  3423. load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
  3424. threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
  3425. #pragma unroll
  3426. for (int ir = 0; ir < qr; ++ir) {
  3427. const int kqs = ir*WARP_SIZE + threadIdx.x;
  3428. const int kbxd = kqs / QI8_1;
  3429. #pragma unroll
  3430. for (int i = 0; i < mmq_x; i += nwarps) {
  3431. const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
  3432. const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
  3433. const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
  3434. tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
  3435. }
  3436. #pragma unroll
  3437. for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
  3438. const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
  3439. const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
  3440. const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
  3441. // if the sum is not needed it's faster to transform the scale to f32 ahead of time
  3442. const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
  3443. half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
  3444. if (need_sum) {
  3445. *dsi_dst = *dsi_src;
  3446. } else {
  3447. float * dfi_dst = (float *) dsi_dst;
  3448. *dfi_dst = __low2float(*dsi_src);
  3449. }
  3450. }
  3451. __syncthreads();
  3452. // #pragma unroll // unrolling this loop causes too much register pressure
  3453. for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
  3454. #pragma unroll
  3455. for (int j = 0; j < mmq_x; j += nwarps) {
  3456. #pragma unroll
  3457. for (int i = 0; i < mmq_y; i += WARP_SIZE) {
  3458. sum[i/WARP_SIZE][j/nwarps] += vec_dot(
  3459. tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
  3460. threadIdx.x + i, threadIdx.y + j, k);
  3461. }
  3462. }
  3463. }
  3464. __syncthreads();
  3465. }
  3466. }
  3467. #pragma unroll
  3468. for (int j = 0; j < mmq_x; j += nwarps) {
  3469. const int col_dst = col_dst_0 + j + threadIdx.y;
  3470. if (col_dst >= ncols_dst) {
  3471. return;
  3472. }
  3473. #pragma unroll
  3474. for (int i = 0; i < mmq_y; i += WARP_SIZE) {
  3475. const int row_dst = row_dst_0 + threadIdx.x + i;
  3476. if (row_dst >= nrows_dst) {
  3477. continue;
  3478. }
  3479. dst[col_dst*nrows_dst + row_dst] = __float2half(sum[i/WARP_SIZE][j/nwarps]);
  3480. }
  3481. }
  3482. }
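// Note: mul_mat_q computes one mmq_y x mmq_x tile of the output per thread block. For each
// slab of blocks_per_warp x-blocks it (1) stages the quantized x tile in shared memory via
// load_tiles, (2) stages the q8_1 y quants and scales in tile_y_qs / tile_y_ds, and (3)
// accumulates per-thread partial sums with vec_dot, before the bounds-checked write-back at
// the end of the function.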
#if defined(USE_ROCM)
#define MMQ_X_Q4_0 64
#define MMQ_Y_Q4_0 128
#define NWARPS_Q4_0 8
#else
#define MMQ_X_Q4_0 4
#define MMQ_Y_Q4_0 32
#define NWARPS_Q4_0 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0, 2)
#endif
    mul_mat_q4_0(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q4_0;
    const int mmq_y = MMQ_Y_Q4_0;
    const int nwarps = NWARPS_Q4_0;

    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q4_0_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int mmq_x = MMQ_X_Q4_0;
    int mmq_y = MMQ_Y_Q4_0;
    int nwarps = NWARPS_Q4_0;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}
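
// The q4_1/q5_0/q5_1/q8_0 and K-quant variants below follow the same
// kernel + launcher pattern as q4_0; only the tile constants, block type and
// vec_dot implementation passed to mul_mat_q differ.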
#if defined(USE_ROCM)
#define MMQ_X_Q4_1 64
#define MMQ_Y_Q4_1 128
#define NWARPS_Q4_1 8
#else
#define MMQ_X_Q4_1 4
#define MMQ_Y_Q4_1 32
#define NWARPS_Q4_1 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1, 2)
#endif
    mul_mat_q4_1(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q4_1;
    const int mmq_y = MMQ_Y_Q4_1;
    const int nwarps = NWARPS_Q4_1;

    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q4_1_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int mmq_x = MMQ_X_Q4_1;
    int mmq_y = MMQ_Y_Q4_1;
    int nwarps = NWARPS_Q4_1;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q5_0 64
#define MMQ_Y_Q5_0 128
#define NWARPS_Q5_0 8
#else
#define MMQ_X_Q5_0 4
#define MMQ_Y_Q5_0 32
#define NWARPS_Q5_0 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0, 2)
#endif
    mul_mat_q5_0(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q5_0;
    const int mmq_y = MMQ_Y_Q5_0;
    const int nwarps = NWARPS_Q5_0;

    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q5_0_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q5_0;
    const int mmq_y = MMQ_Y_Q5_0;
    const int nwarps = NWARPS_Q5_0;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q5_1 64
#define MMQ_Y_Q5_1 128
#define NWARPS_Q5_1 8
#else
#define MMQ_X_Q5_1 4
#define MMQ_Y_Q5_1 32
#define NWARPS_Q5_1 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1, 2)
#endif
    mul_mat_q5_1(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q5_1;
    const int mmq_y = MMQ_Y_Q5_1;
    const int nwarps = NWARPS_Q5_1;

    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q5_1_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q5_1;
    const int mmq_y = MMQ_Y_Q5_1;
    const int nwarps = NWARPS_Q5_1;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q8_0 64
#define MMQ_Y_Q8_0 128
#define NWARPS_Q8_0 8
#else
#define MMQ_X_Q8_0 4
#define MMQ_Y_Q8_0 32
#define NWARPS_Q8_0 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0, 2)
#endif
    mul_mat_q8_0(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q8_0;
    const int mmq_y = MMQ_Y_Q8_0;
    const int nwarps = NWARPS_Q8_0;

    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q8_0_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q8_0;
    const int mmq_y = MMQ_Y_Q8_0;
    const int nwarps = NWARPS_Q8_0;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}
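
// MMQ kernels and launchers for the K-quant formats, which use QK_K-value
// super-blocks (256 by default) instead of 32-value blocks.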
#if defined(USE_ROCM)
#define MMQ_X_Q2_K 64
#define MMQ_Y_Q2_K 128
#define NWARPS_Q2_K 8
#else
#define MMQ_X_Q2_K 4
#define MMQ_Y_Q2_K 32
#define NWARPS_Q2_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K, 2)
#endif
    mul_mat_q2_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q2_K;
    const int mmq_y = MMQ_Y_Q2_K;
    const int nwarps = NWARPS_Q2_K;

    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q2_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q2_K;
    const int mmq_y = MMQ_Y_Q2_K;
    const int nwarps = NWARPS_Q2_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q3_K 64
#define MMQ_Y_Q3_K 128
#define NWARPS_Q3_K 8
#else
#define MMQ_X_Q3_K 4
#define MMQ_Y_Q3_K 32
#define NWARPS_Q3_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K, 2)
#endif
    mul_mat_q3_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q3_K;
    const int mmq_y = MMQ_Y_Q3_K;
    const int nwarps = NWARPS_Q3_K;

    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q3_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q3_K;
    const int mmq_y = MMQ_Y_Q3_K;
    const int nwarps = NWARPS_Q3_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q4_K 64
#define MMQ_Y_Q4_K 128
#define NWARPS_Q4_K 8
#else
#define MMQ_X_Q4_K 4
#define MMQ_Y_Q4_K 32
#define NWARPS_Q4_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K, 2)
#endif
    mul_mat_q4_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q4_K;
    const int mmq_y = MMQ_Y_Q4_K;
    const int nwarps = NWARPS_Q4_K;

    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q4_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q4_K;
    const int mmq_y = MMQ_Y_Q4_K;
    const int nwarps = NWARPS_Q4_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q5_K 64
#define MMQ_Y_Q5_K 128
#define NWARPS_Q5_K 8
#else
#define MMQ_X_Q5_K 4
#define MMQ_Y_Q5_K 32
#define NWARPS_Q5_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K, 2)
#endif
    mul_mat_q5_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q5_K;
    const int mmq_y = MMQ_Y_Q5_K;
    const int nwarps = NWARPS_Q5_K;

    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q5_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q5_K;
    const int mmq_y = MMQ_Y_Q5_K;
    const int nwarps = NWARPS_Q5_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

#if defined(USE_ROCM)
#define MMQ_X_Q6_K 64
#define MMQ_Y_Q6_K 128
#define NWARPS_Q6_K 8
#else
#define MMQ_X_Q6_K 4
#define MMQ_Y_Q6_K 32
#define NWARPS_Q6_K 4
#endif

template <bool need_check> static __global__ void
#if defined(USE_ROCM)
    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K, 2)
#endif
    mul_mat_q6_K(
    const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const int mmq_x = MMQ_X_Q6_K;
    const int mmq_y = MMQ_Y_Q6_K;
    const int nwarps = NWARPS_Q6_K;

    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}

static void ggml_mul_mat_q6_K_q8_1_cuda(
    const void * vx, const void * vy, half * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    const int mmq_x = MMQ_X_Q6_K;
    const int mmq_y = MMQ_Y_Q6_K;
    const int nwarps = NWARPS_Q6_K;

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}
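
// PyTorch-facing entry points. The int8_t `type` argument matches the ggml
// type enumeration (e.g. 2 = Q4_0, 8 = Q8_0, 12 = Q4_K, 14 = Q6_K); the switch
// statements below list the exact codes each entry point supports.
//
// ggml_dequantize: dequantize an m x n quantized weight tensor W to fp16.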
torch::Tensor ggml_dequantize(
    torch::Tensor W,  // quant weight
    int8_t type,
    int64_t m,
    int64_t n
){
    const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
    auto options = torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
    at::Tensor DW = torch::empty({m, n}, options);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type);
    to_fp16_cuda(
        (void*)W.data_ptr(), (half*)DW.data_ptr(), m * n, stream
    );
    return DW;
}

// Newer quantization types do not implement ggml_mul_mat_vec and are only used through ggml_mul_mat_vec_a8.
torch::Tensor ggml_mul_mat_vec(
    torch::Tensor W,  // quant weight
    torch::Tensor X,  // input
    int8_t type,
    int64_t row
){
    size_t col = X.sizes()[1];
    const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
    auto options = torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
    at::Tensor Y = torch::empty({1, row}, options);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
    switch (type) {
        case 2:
            dequantize_mul_mat_vec_q4_0_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 3:
            dequantize_mul_mat_vec_q4_1_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 6:
            dequantize_mul_mat_vec_q5_0_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 7:
            dequantize_mul_mat_vec_q5_1_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 8:
            dequantize_mul_mat_vec_q8_0_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 10:
            dequantize_mul_mat_vec_q2_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 11:
            dequantize_mul_mat_vec_q3_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 12:
            dequantize_mul_mat_vec_q4_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 13:
            dequantize_mul_mat_vec_q5_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 14:
            dequantize_mul_mat_vec_q6_K_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 16:
            dequantize_mul_mat_vec_iq2_xxs_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 17:
            dequantize_mul_mat_vec_iq2_xs_cuda((void*)W.data_ptr(), (half*)X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
    }
    return Y;
}
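
// ggml_mul_mat_vec_a8: quantized matrix-vector product for a single input row.
// X is first quantized on the GPU to q8_1 (blocks of 32 int8 values with an
// fp16 scale and block sum), then multiplied against the quantized weight W
// without dequantizing W to fp16 first.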
torch::Tensor ggml_mul_mat_vec_a8(
    torch::Tensor W,  // quant weight
    torch::Tensor X,  // input
    int8_t type,
    int64_t row
){
    int col = X.sizes()[1];
    const int padded = (col + 512 - 1) / 512 * 512;
    const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
    auto options = torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
    at::Tensor Y = torch::empty({1, row}, options);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
    options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
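    // each q8_1 block covers 32 values and occupies 36 bytes (32 int8 quants
    // plus a half2 holding the scale and block sum), i.e. 9 int32 per block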
    at::Tensor quant_X = torch::empty({1, padded / 32 * 9}, options);
    quantize_row_q8_1_cuda((half*)X.data_ptr(), (void*)quant_X.data_ptr(), col, 1, stream);
    switch (type) {
        case 2:
            mul_mat_vec_q4_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 3:
            mul_mat_vec_q4_1_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 6:
            mul_mat_vec_q5_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 7:
            mul_mat_vec_q5_1_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 8:
            mul_mat_vec_q8_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 10:
            mul_mat_vec_q2_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 11:
            mul_mat_vec_q3_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 12:
            mul_mat_vec_q4_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 13:
            mul_mat_vec_q5_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 14:
            mul_mat_vec_q6_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 16:
            mul_mat_vec_iq2_xxs_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 17:
            mul_mat_vec_iq2_xs_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 18:
            mul_mat_vec_iq3_xxs_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 19:
            mul_mat_vec_iq1_s_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 20:
            mul_mat_vec_iq4_nl_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 21:
            mul_mat_vec_iq3_s_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 22:
            mul_mat_vec_iq2_s_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
        case 23:
            mul_mat_vec_iq4_xs_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, stream);
            break;
    }
    return Y;
}
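
// ggml_mul_mat_a8: batched variant that quantizes each of the `batch` input
// rows to q8_1 and routes through the tiled MMQ kernels above. Only the legacy
// and K-quant formats (types 2-14) are handled; any other type falls through
// the switch and Y is returned uninitialized, so callers are presumably
// expected to dispatch the IQ formats through the vector path instead.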
torch::Tensor ggml_mul_mat_a8(
    torch::Tensor W,  // quant weight
    torch::Tensor X,  // input
    int8_t type,
    int64_t row
) {
    int col = X.sizes()[1];
    int padded = (col + 512 - 1) / 512 * 512;
    int batch = X.sizes()[0];
    const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
    auto options = torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
    at::Tensor Y = torch::empty({batch, row}, options);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
    options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
    at::Tensor quant_X = torch::empty({batch, padded / 32 * 9}, options);
    quantize_row_q8_1_cuda((half*)X.data_ptr(), (void*)quant_X.data_ptr(), col, batch, stream);
    switch (type) {
        case 2:
            ggml_mul_mat_q4_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 3:
            ggml_mul_mat_q4_1_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 6:
            ggml_mul_mat_q5_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 7:
            ggml_mul_mat_q5_1_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 8:
            ggml_mul_mat_q8_0_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 10:
            ggml_mul_mat_q2_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 11:
            ggml_mul_mat_q3_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 12:
            ggml_mul_mat_q4_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 13:
            ggml_mul_mat_q5_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
        case 14:
            ggml_mul_mat_q6_K_q8_1_cuda((void*)W.data_ptr(), (void*)quant_X.data_ptr(), (half*)Y.data_ptr(), col, row, batch, padded, row, stream);
            break;
    }
    return Y;
}
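
// Usage sketch (illustrative only, assuming these entry points are registered
// with PyTorch elsewhere in the extension; tensor names are placeholders):
//
//   // W: GGUF-quantized weight blocks, nrows x ncols logical shape
//   // X: fp16 activations on the same device
//   torch::Tensor W_fp16 = ggml_dequantize(W, /*type=*/12, /*m=*/nrows, /*n=*/ncols);
//   torch::Tensor y_vec  = ggml_mul_mat_vec_a8(W, X_single_row, /*type=*/12, /*row=*/nrows);
//   torch::Tensor Y_mat  = ggml_mul_mat_a8(W, X_batch, /*type=*/12, /*row=*/nrows);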