# test_flash_attn.py

import math

import pytest
import torch
import torch.nn.functional as F
from einops import rearrange, repeat

from flash_attn_interface import flash_attn_func, flash_attn_varlen_func, _flash_attn_forward
from tests.test_util import (
    generate_random_padding_mask,
    generate_qkv,
    construct_local_mask,
    attention_ref,
)

ABS_TOL = 5e-3
REL_TOL = 1e-1


def print_diffs(out, out_ref):
    out_1d = out.flatten()
    out_ref_1d = out_ref.flatten()
    for idx, (e_o, e_o_ref) in enumerate(zip(out_1d, out_ref_1d)):
        diff = e_o - e_o_ref
        abs_diff = abs(diff)
        abs_ref = abs(e_o_ref + 1e-5)
        relative_diff = abs_diff / abs_ref
        if abs_diff > ABS_TOL or relative_diff > REL_TOL:
            print(f"==== diff ==== {idx}, test: {e_o}, ref: {e_o_ref}")
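
# Hypothetical debugging aid (added commentary, not invoked by the tests below):
# after a tolerance assertion fails, print_diffs lists every element that falls
# outside ABS_TOL / REL_TOL, e.g.
#     out, _ = flash_attn_func(q, k, v, causal=False)
#     out_ref, _ = attention_ref(q.float(), k.float(), v.float(), None, None)
#     print_diffs(out, out_ref)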


@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn])
# @pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
@pytest.mark.parametrize("mha_type", ["mha"])
# @pytest.mark.parametrize("causal", [False, True])
@pytest.mark.parametrize("causal", [False])
# @pytest.mark.parametrize("local", [False, True])
@pytest.mark.parametrize("local", [False])
@pytest.mark.parametrize("deterministic", [True])
# @pytest.mark.parametrize("gqa_parallel", [False, True])
@pytest.mark.parametrize("gqa_parallel", [False])
# @pytest.mark.parametrize("d", [64, 128, 256])
@pytest.mark.parametrize("d", [128])
@pytest.mark.parametrize("descale", [1.0])
# @pytest.mark.parametrize("descale", [1.0, 2.0, 3.0])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        # (1, 1),
        # (64, 128),
        # (128, 128),
        # (256, 256),
        # (113, 203),
        # (128, 217),
        # (113, 211),
        # (108, 256),
        # (256, 512),
        # (384, 256),
        # (640, 128),
        # (512, 256),
        (1024, 1024),
        (1023, 1024),
        (1024, 1023),
        (4096, 4096),
        (4224, 4224),
    ],
)
def test_flash_attn_output_fp8(
    seqlen_q, seqlen_k, d, causal, local, deterministic, mha_type, dtype, descale, gqa_parallel
):
    device = "cuda"
    dtype_init = torch.bfloat16
    print(dtype)
    print("causal", causal)
    print("local", local)
    print("gqa_parallel", gqa_parallel)
    # set seed
    torch.random.manual_seed(42)
    # batch_size = 40
    # nheads = 16
    batch_size = 4
    nheads = 6
    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
    # nheads_kv = 1
    # batch_size = 9
    # nheads = 6
    window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype_init, requires_grad=True)
    k = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True)
    v = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True)

    q = q.to(dtype)
    k = k.to(dtype)
    v = v.to(dtype)

    softmax_scale = q.shape[-1] ** (-0.5)
    descale_q = torch.tensor([descale], dtype=torch.float32, device="cuda")
    descale_k = torch.tensor([descale], dtype=torch.float32, device="cuda")
    descale_v = torch.tensor([descale], dtype=torch.float32, device="cuda")

    out, lse = flash_attn_func(
        q, k, v, causal=causal, window_size=window_size, deterministic=deterministic,
        gqa_parallel=gqa_parallel, descale_q=descale_q, descale_k=descale_k, descale_v=descale_v
    )

    q = q.to(dtype_init)
    k = k.to(dtype_init)
    v = v.to(dtype_init)

    descale_q = descale_q.to(dtype_init)
    descale_k = descale_k.to(dtype_init)
    descale_v = descale_v.to(dtype_init)

    q = q * descale_q
    k = k * descale_k
    v = v * descale_v

    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        window_size=window_size,
    )
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        window_size=window_size,
        upcast=False,
        reorder_ops=True,
    )

    # qk = torch.einsum('bshd,bthd->bhst', q, k).float()
    # m = qk.amax(-1, keepdim=True)
    # s_tmp = torch.exp((qk - m) / math.sqrt(d))
    # exp_sum = s_tmp.sum(-1)
    # qk = torch.einsum('bthd,bshd->bhts', q.float() / math.sqrt(d), k.float())
    # lse_ref = torch.logsumexp(qk, dim=-1)

    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")

    # if not causal:
    #     print(f"LSE max diff: {(lse - lse_ref).abs().max().item()}")
    # breakpoint()

    # dS = torch.einsum('bthd,bshd->bhts', g.float(), v.float())
    # P = torch.softmax(qk, -1)
    # dP = P * (dS - do_o.unsqueeze(1))
    # dQ = torch.einsum('bhts,bshd->bthd', dP, k.float())
    # dV = torch.einsum('bhts,bthd->bshd', P, g.float())
    # dK = torch.einsum('bhts,bthd->bshd', dP, q.float())
    # breakpoint()

    # assert (out - out_ref).abs().max().item() <= 4 * (out_pt - out_ref).abs().max().item() + 1e-2
    atol = 4 * (out_pt - out_ref).abs().max().item() + 1e-2
    torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=atol, check_dtype=False)
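
# Sketch (added commentary, not part of the test above): here descale is a fixed
# test parameter, but in real fp8 use the descale factors would come from
# per-tensor quantization. One common recipe, shown only for illustration:
#
#     def quantize_e4m3(x):
#         fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0
#         scale = x.abs().amax().clamp(min=1e-12) / fp8_max
#         x_fp8 = (x / scale).to(torch.float8_e4m3fn)
#         descale = scale.reshape(1).to(torch.float32)  # shape (1,), fp32, same device
#         return x_fp8, descale
#
# The kernel then consumes x_fp8 together with descale_q/k/v, while a reference
# path can recover an approximation of x as x_fp8.to(torch.bfloat16) * descale,
# mirroring the q = q * descale_q step above.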


@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
# @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
# @pytest.mark.parametrize("mha_type", ["mha"])
@pytest.mark.parametrize("causal", [False, True])
# @pytest.mark.parametrize("causal", [False])
@pytest.mark.parametrize("local", [False, True])
# @pytest.mark.parametrize("local", [True])
@pytest.mark.parametrize("deterministic", [False, True])
# @pytest.mark.parametrize("deterministic", [True])
@pytest.mark.parametrize("gqa_parallel", [False, True])
# @pytest.mark.parametrize("gqa_parallel", [False])
# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192])
# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192])
# @pytest.mark.parametrize('d', [56, 80])
# @pytest.mark.parametrize("d", [64, 128, 256])
# @pytest.mark.parametrize("d", [64, 96, 128])
# @pytest.mark.parametrize("d", [64])
@pytest.mark.parametrize("d", [64, 128, 256])
@pytest.mark.parametrize("descale", [1.0])
# @pytest.mark.parametrize("descale", [1.0, 2.0, 3.0, 4.0])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        (1, 1),
        (64, 128),
        (128, 128),
        (256, 256),
        (113, 203),
        (128, 217),
        (113, 211),
        (108, 256),
        (256, 512),
        (384, 256),
        (640, 128),
        (512, 256),
        (1024, 1024),
        (1023, 1024),
        (1024, 1023),
        (4096, 4096),
        (4224, 4224),
    ],
)
# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
def test_flash_attn_output(
    seqlen_q, seqlen_k, d, causal, local, deterministic, mha_type, dtype, descale, gqa_parallel
):
    device = "cuda"
    if dtype == torch.float8_e4m3fn:
        dtype_init = torch.bfloat16
    else:
        dtype_init = dtype
    print(dtype)
    print("causal", causal)
    print("local", local)
    print("gqa_parallel", gqa_parallel)
    # set seed
    torch.random.manual_seed(42)
    # batch_size = 40
    # nheads = 16
    batch_size = 4
    nheads = 6
    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
    # nheads_kv = 1
    # batch_size = 9
    # nheads = 6
    window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype_init, requires_grad=True)
    k = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True)
    v = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True)

    q = q.to(dtype)
    k = k.to(dtype)
    v = v.to(dtype)

    softmax_scale = q.shape[-1] ** (-0.5)
    descale_q = torch.tensor([descale], dtype=torch.float32, device="cuda")
    descale_k = torch.tensor([descale], dtype=torch.float32, device="cuda")
    descale_v = torch.tensor([descale], dtype=torch.float32, device="cuda")

    if dtype != torch.float8_e4m3fn:
        out, lse = flash_attn_func(
            q, k, v, causal=causal, window_size=window_size, deterministic=deterministic,
            gqa_parallel=gqa_parallel
        )
    else:
        out, lse = flash_attn_func(
            q, k, v, causal=causal, window_size=window_size, deterministic=deterministic,
            gqa_parallel=gqa_parallel, descale_q=descale_q, descale_k=descale_k, descale_v=descale_v
        )

    q = q.to(dtype_init)
    k = k.to(dtype_init)
    v = v.to(dtype_init)

    if dtype == torch.float8_e4m3fn:
        descale_q = descale_q.to(dtype_init)
        descale_k = descale_k.to(dtype_init)
        descale_v = descale_v.to(dtype_init)
        q = q * descale_q
        k = k * descale_k
        v = v * descale_v

    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        window_size=window_size,
    )
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        window_size=window_size,
        upcast=False,
        reorder_ops=True,
    )

    # qk = torch.einsum('bshd,bthd->bhst', q, k).float()
    # m = qk.amax(-1, keepdim=True)
    # s_tmp = torch.exp((qk - m) / math.sqrt(d))
    # exp_sum = s_tmp.sum(-1)
    # qk = torch.einsum('bthd,bshd->bhts', q.float() / math.sqrt(d), k.float())
    # lse_ref = torch.logsumexp(qk, dim=-1)

    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")

    # if not causal:
    #     print(f"LSE max diff: {(lse - lse_ref).abs().max().item()}")
    # breakpoint()

    if d <= 128 and dtype != torch.float8_e4m3fn:
        g = torch.randn_like(out)
        do_o = (g.float() * out.float()).sum(-1)
        dq, dk, dv = torch.autograd.grad(out, (q, k, v), g)
        dq_ref, dk_ref, dv_ref = torch.autograd.grad(out_ref, (q, k, v), g)
        dq_pt, dk_pt, dv_pt = torch.autograd.grad(out_pt, (q, k, v), g)
        print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}")
        print(f"dK max diff: {(dk - dk_ref).abs().max().item()}")
        print(f"dV max diff: {(dv - dv_ref).abs().max().item()}")
        print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}")
        print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}")
        print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}")
        print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}")
        print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}")
        print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}")
        print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}")
        print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
        print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")

    # dS = torch.einsum('bthd,bshd->bhts', g.float(), v.float())
    # P = torch.softmax(qk, -1)
    # dP = P * (dS - do_o.unsqueeze(1))
    # dQ = torch.einsum('bhts,bshd->bthd', dP, k.float())
    # dV = torch.einsum('bhts,bthd->bshd', P, g.float())
    # dK = torch.einsum('bhts,bthd->bshd', dP, q.float())
    # breakpoint()

    # Check that FlashAttention's numerical error is at most twice the numerical error
    # of a Pytorch implementation.
    # breakpoint()
    if dtype != torch.float8_e4m3fn:
        assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 3e-5
    else:
        # just test correctness of fp8 kernel w/o further quantization techniques
        assert (out - out_ref).abs().max().item() <= 4 * (out_pt - out_ref).abs().max().item() + 2e-2

    if d <= 128 and dtype != torch.float8_e4m3fn:
        assert (dq - dq_ref).abs().max().item() <= 2 * (dq_pt - dq_ref).abs().max().item() + 3e-5
        assert (dk - dk_ref).abs().max().item() <= 2 * (dk_pt - dk_ref).abs().max().item() + 3e-5
        assert (dv - dv_ref).abs().max().item() <= 2 * (dv_pt - dv_ref).abs().max().item() + 3e-5
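
# Illustration only (added commentary, not the helper used by attention_ref):
# with local attention, window_size = (left, right) restricts each query position
# i to key positions j in [i - left, i + right]. A minimal boolean-mask sketch
# for equal query/key lengths:
#
#     def sliding_window_mask(seqlen, left, right, device="cuda"):
#         i = torch.arange(seqlen, device=device).unsqueeze(-1)
#         j = torch.arange(seqlen, device=device).unsqueeze(0)
#         return (j >= i - left) & (j <= i + right)  # True = may attend
#
# window_size = (-1, -1) above disables the window entirely (full attention).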


# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("dtype", [torch.float16])
# @pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
@pytest.mark.parametrize("mha_type", ["mha"])
# @pytest.mark.parametrize("causal", [False, True])
@pytest.mark.parametrize("causal", [False])
# @pytest.mark.parametrize("local", [False, True])
@pytest.mark.parametrize("local", [False])
# @pytest.mark.parametrize("deterministic", [False, True])
@pytest.mark.parametrize("deterministic", [True])
# @pytest.mark.parametrize("add_unused_qkv", [False, True])
@pytest.mark.parametrize("add_unused_qkv", [True])
# @pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize('d', [256])
# @pytest.mark.parametrize("d", [64, 128, 256])
# @pytest.mark.parametrize("d", [64, 128])
@pytest.mark.parametrize("d", [128])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        # (1, 1),
        # (1, 3),
        # (2, 1),
        # (511, 1),
        # (3, 513),
        # (64, 128),
        # (113, 203),
        # (128, 128),
        # (128, 217),
        # (113, 211),
        # (108, 256),
        # (256, 512),
        # (384, 256),
        # (512, 256),
        # (640, 128),
        # (1024, 1024),
        # (1023, 1024),
        # (1024, 1023),
        # (2048, 2048),
        (4096, 4096),
    ],
)
# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
def test_flash_attn_varlen_output(
    seqlen_q, seqlen_k, d, causal, local, deterministic, add_unused_qkv, mha_type, dtype
):
    if (
        max(seqlen_q, seqlen_k) >= 2048
        and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30
    ):
        pytest.skip()  # Reference implementation OOM
    device = "cuda"
    # set seed
    torch.random.manual_seed(0)
    # batch_size = 1
    # nheads = 1
    # nheads_kv = 1
    batch_size = 9
    nheads = 6
    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)

    window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
    k = torch.randn(
        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
    )
    v = torch.randn(
        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
    )

    query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random", zero_lengths=False)
    key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random", zero_lengths=True)
    # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full')

    def _gen_unused_masks(padding_mask, add_unused, max_seq_len, bs, device):
        if add_unused:
            another_mask = generate_random_padding_mask(max_seq_len, bs, device)
            attn_mask = torch.logical_and(padding_mask, another_mask)
            unused_mask = torch.logical_xor(torch.logical_or(padding_mask, another_mask), attn_mask)
        else:
            attn_mask = padding_mask
            unused_mask = None
        return attn_mask, unused_mask
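    # Note (added commentary): XOR of the union and the intersection is the
    # symmetric difference, so unused_mask marks positions kept by exactly one of
    # the two random masks; generate_qkv takes it alongside the padding mask to
    # produce the seqused_q / seqused_k passed to the kernel below.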

    query_padding_mask, query_unused_mask = _gen_unused_masks(query_padding_mask, add_unused_qkv, seqlen_q, batch_size, q.device)
    key_padding_mask, key_unused_mask = _gen_unused_masks(key_padding_mask, add_unused_qkv, seqlen_k, batch_size, k.device)

    (
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        seqused_q,
        seqused_k,
        max_seqlen_q,
        max_seqlen_k,
        q,
        k,
        v,
        output_pad_fn,
        dq_pad_fn,
        dk_pad_fn,
    ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False, query_unused_mask=query_unused_mask, key_unused_mask=key_unused_mask)
    # print("cu_seqlens_q: ", cu_seqlens_q)
    # print("cu_seqlens_k: ", cu_seqlens_k)
    # print("q_unpad, shape: ", q_unpad.shape)
    # print("k_unpad, shape: ", k_unpad.shape)
    # print("v_unpad, shape: ", v_unpad.shape)

    out_unpad, sm_lse = flash_attn_varlen_func(
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        causal=causal,
        deterministic=deterministic,
        seqused_q=seqused_q,
        seqused_k=seqused_k,
        window_size=window_size,
    )
    out = output_pad_fn(out_unpad)
    if query_unused_mask is not None:
        q_zero_masking = rearrange(query_unused_mask, "b s -> b s 1 1")
        out.masked_fill_(q_zero_masking, 0.0)
    dropout_mask = None

    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        query_padding_mask,
        key_padding_mask,
        causal=causal,
        window_size=window_size,
    )
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        query_padding_mask,
        key_padding_mask,
        causal=causal,
        window_size=window_size,
        upcast=False,
        reorder_ops=True,
    )

    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")

    g = torch.randn_like(out)
    if d <= 128:
        (
            dq_unpad,
            dk_unpad,
            dv_unpad,
        ) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g)
        dk = dk_pad_fn(dk_unpad)
        dv = dk_pad_fn(dv_unpad)
        if key_unused_mask is not None:
            k_zero_masking = rearrange(key_unused_mask, "b s -> b s 1 1")
            dk.masked_fill_(k_zero_masking, 0.0)
            dv.masked_fill_(k_zero_masking, 0.0)
        (
            dq_ref,
            dk_ref,
            dv_ref,
        ) = torch.autograd.grad(out_ref, (q, k, v), g)
        zero_masking = rearrange(torch.logical_not(torch.any(key_padding_mask, 1)), "b -> b 1 1 1")
        dk_ref.masked_fill_(zero_masking, 0.0)
        dv_ref.masked_fill_(zero_masking, 0.0)
        (
            dq_pt,
            dk_pt,
            dv_pt,
        ) = torch.autograd.grad(out_pt, (q, k, v), g)
        dk_pt.masked_fill_(zero_masking, 0.0)
        dv_pt.masked_fill_(zero_masking, 0.0)
        dq = dq_pad_fn(dq_unpad)
        if query_unused_mask is not None:
            dq.masked_fill_(q_zero_masking, 0.0)
        print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}")
        print(f"dK max diff: {(dk - dk_ref).abs().max().item()}")
        print(f"dV max diff: {(dv - dv_ref).abs().max().item()}")
        print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}")
        print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}")
        print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}")
        print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}")
        print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}")
        print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}")
        print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}")
        print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
        print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")

    # Check that FlashAttention's numerical error is at most twice the numerical error
    # of a Pytorch implementation.
    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()

    if d <= 128:
        assert (dq - dq_ref).abs().max().item() < 1e-4 or (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item()
        assert (dk - dk_ref).abs().max().item() < 1e-4 or (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item()
        assert (dv - dv_ref).abs().max().item() < 1e-4 or (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item()
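
# Sketch of the varlen ("unpadded") layout used above, with assumed example
# values (not produced by the test): with per-sequence lengths [3, 5, 2] the
# tokens are packed into one (total_tokens, nheads, d) tensor and
#
#     cu_seqlens = torch.tensor([0, 3, 8, 10], dtype=torch.int32)  # prefix sums
#
# so sequence b occupies rows cu_seqlens[b]:cu_seqlens[b + 1]; max_seqlen is the
# largest per-sequence length (5 here), and seqused_q / seqused_k can further
# limit how many rows of each slot the kernel actually reads.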


@pytest.mark.parametrize("dtype_fp8", [torch.float8_e4m3fn])
# @pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
@pytest.mark.parametrize("mha_type", ["mha"])
@pytest.mark.parametrize("causal", [False, True])
# @pytest.mark.parametrize("causal", [False])
# @pytest.mark.parametrize("local", [False, True])
@pytest.mark.parametrize("local", [False])
# @pytest.mark.parametrize("deterministic", [False, True])
@pytest.mark.parametrize("deterministic", [True])
# @pytest.mark.parametrize("add_unused_qkv", [False, True])
@pytest.mark.parametrize("add_unused_qkv", [True])
# @pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize('d', [256])
# @pytest.mark.parametrize("d", [64, 128, 256])
# @pytest.mark.parametrize("d", [64, 128])
@pytest.mark.parametrize("d", [128])
@pytest.mark.parametrize("descale", [1.0])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        # (1, 1),
        # (1, 3),
        # (2, 1),
        # (511, 1),
        # (3, 513),
        # (64, 128),
        # (113, 203),
        # (128, 128),
        # (128, 217),
        # (113, 211),
        # (108, 256),
        # (256, 512),
        # (384, 256),
        # (512, 256),
        # (640, 128),
        (1024, 1024),
        (1023, 1024),
        (1024, 1023),
        (2048, 2048),
        (4096, 4096),
    ],
)
# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
def test_flash_attn_varlen_fp8_output(
    seqlen_q, seqlen_k, d, causal, local, deterministic, add_unused_qkv, mha_type, dtype_fp8, descale
):
    print(dtype_fp8, causal)
    dtype = torch.bfloat16
    if (
        max(seqlen_q, seqlen_k) >= 2048
        and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30
    ):
        pytest.skip()  # Reference implementation OOM
    device = "cuda"
    # set seed
    torch.random.manual_seed(0)
    # batch_size = 1
    # nheads = 1
    # nheads_kv = 1
    batch_size = 9
    nheads = 6
    nheads_kv = nheads if mha_type == "mha" else (2 if mha_type == "gqa" else 1)

    window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
    k = torch.randn(
        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
    )
    v = torch.randn(
        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
    )
    q = q.to(dtype_fp8).to(dtype)
    k = k.to(dtype_fp8).to(dtype)
    v = v.to(dtype_fp8).to(dtype)

    descale_q = torch.tensor([descale], dtype=torch.float32, device="cuda")
    descale_k = torch.tensor([descale], dtype=torch.float32, device="cuda")
    descale_v = torch.tensor([descale], dtype=torch.float32, device="cuda")

    query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random", zero_lengths=False)
    key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random", zero_lengths=True)
    # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full')

    def _gen_unused_masks(padding_mask, add_unused, max_seq_len, bs, device):
        if add_unused:
            another_mask = generate_random_padding_mask(max_seq_len, bs, device)
            attn_mask = torch.logical_and(padding_mask, another_mask)
            unused_mask = torch.logical_xor(torch.logical_or(padding_mask, another_mask), attn_mask)
        else:
            attn_mask = padding_mask
            unused_mask = None
        return attn_mask, unused_mask

    query_padding_mask, query_unused_mask = _gen_unused_masks(query_padding_mask, add_unused_qkv, seqlen_q, batch_size, q.device)
    key_padding_mask, key_unused_mask = _gen_unused_masks(key_padding_mask, add_unused_qkv, seqlen_k, batch_size, k.device)

    (
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        seqused_q,
        seqused_k,
        max_seqlen_q,
        max_seqlen_k,
        q,
        k,
        v,
        output_pad_fn,
        dq_pad_fn,
        dk_pad_fn,
    ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False, query_unused_mask=query_unused_mask, key_unused_mask=key_unused_mask)
    # print("cu_seqlens_q: ", cu_seqlens_q)
    # print("cu_seqlens_k: ", cu_seqlens_k)
    # print("q_unpad, shape: ", q_unpad.shape)
    # print("k_unpad, shape: ", k_unpad.shape)
    # print("v_unpad, shape: ", v_unpad.shape)

    q_unpad = q_unpad.to(dtype_fp8)
    k_unpad = k_unpad.to(dtype_fp8)
    v_unpad = v_unpad.to(dtype_fp8)
    # print(cu_seqlens_q)
    # print(cu_seqlens_k)
    # print(max_seqlen_q)
    # print(max_seqlen_k)
    # torch.cuda.synchronize()

    out_unpad, sm_lse = flash_attn_varlen_func(
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        causal=causal,
        deterministic=deterministic,
        seqused_q=seqused_q,
        seqused_k=seqused_k,
        window_size=window_size,
        descale_q=descale_q,
        descale_k=descale_k,
        descale_v=descale_v,
    )
    out = output_pad_fn(out_unpad)
    if query_unused_mask is not None:
        q_zero_masking = rearrange(query_unused_mask, "b s -> b s 1 1")
        out.masked_fill_(q_zero_masking, 0.0)
    dropout_mask = None

    descale_q = descale_q.to(dtype)
    descale_k = descale_k.to(dtype)
    descale_v = descale_v.to(dtype)
    q = q * descale_q
    k = k * descale_k
    v = v * descale_v

    # print(out)
    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        query_padding_mask,
        key_padding_mask,
        causal=causal,
        window_size=window_size,
    )
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        query_padding_mask,
        key_padding_mask,
        causal=causal,
        window_size=window_size,
        upcast=False,
        reorder_ops=True,
    )

    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")

    # atol = 4 * (out_pt - out_ref).abs().max().item() + 1e-2
    atol = 1e-2
    try:
        torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=atol, check_dtype=False)
    except AssertionError as e:
        # Calculate absolute differences
        diff = (out - out_ref).abs()
        # Get the indices of the top 2 maximum differences
        top2_diff_values, top2_diff_indices = torch.topk(diff.flatten(), k=2)
        # Print the results
        for i, idx in enumerate(top2_diff_indices):
            coords = torch.unravel_index(idx, diff.shape)
            print(
                f"Entry {i + 1} with greatest diff: Coordinates: {coords}, "
                f"out: {out[coords].item()}, out_ref: {out_ref[coords].item()}, "
                f"diff: {top2_diff_values[i].item()}"
            )

    # Check that FlashAttention's numerical error is at most twice the numerical error
    # of a Pytorch implementation.
    # assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
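

# Convenience entry point (added sketch, not required for pytest collection):
# run this file directly, assuming a CUDA device and the flash_attn_interface
# extension are available in the environment.
if __name__ == "__main__":
    import sys

    # -q: quiet output, -x: stop at the first failing parametrization.
    sys.exit(pytest.main([__file__, "-q", "-x"]))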