# test_flash_attn.py

import math

import pytest
import torch
import torch.nn.functional as F
from einops import rearrange, repeat

from flash_attn_interface import flash_attn_func, flash_attn_varlen_func, _flash_attn_forward
from tests.test_util import generate_random_padding_mask, generate_qkv, construct_local_mask, attention_ref

ABS_TOL = 5e-3
REL_TOL = 1e-1
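

# Debug helper: prints every flattened element where the test output deviates from
# the reference by more than the absolute or relative tolerance above.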
def print_diffs(out, out_ref):
    out_1d = out.flatten()
    out_ref_1d = out_ref.flatten()
    for idx, (e_o, e_o_ref) in enumerate(zip(out_1d, out_ref_1d)):
        diff = e_o - e_o_ref
        abs_diff = abs(diff)
        abs_ref = abs(e_o_ref + 1e-5)
        relative_diff = abs_diff / abs_ref
        if abs_diff > ABS_TOL or relative_diff > REL_TOL:
            print(f"==== diff ==== {idx}, test: {e_o}, ref: {e_o_ref}")
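

# Forward-only FP8 (e4m3) test: q/k/v are generated in bf16, cast to fp8 for the
# kernel, and the output is compared against a bf16 reference computed on the
# descaled (dequantized) inputs.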
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
@pytest.mark.parametrize("causal", [False, True])
@pytest.mark.parametrize("local", [False, True])
@pytest.mark.parametrize("deterministic", [True])
@pytest.mark.parametrize("gqa_parallel", [False, True])
@pytest.mark.parametrize("d", [64, 128, 256])
# @pytest.mark.parametrize("descale", [1.0])
@pytest.mark.parametrize("descale", [1.0, 2.0, 3.0])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        (1, 1),
        (64, 128),
        (128, 128),
        (256, 256),
        (113, 203),
        (128, 217),
        (113, 211),
        (108, 256),
        (256, 512),
        (384, 256),
        (640, 128),
        (512, 256),
        (1024, 1024),
        (1023, 1024),
        (1024, 1023),
        (4096, 4096),
        (4224, 4224),
    ],
)
def test_flash_attn_output_fp8(
    seqlen_q, seqlen_k, d, causal, local, deterministic, mha_type, dtype, descale, gqa_parallel
):
    device = "cuda"
    dtype_init = torch.bfloat16
    print(dtype)
    print('causal', causal)
    print('local', local)
    print('gqa_parallel', gqa_parallel)
    # set seed
    torch.random.manual_seed(42)
    # batch_size = 40
    # nheads = 16
    batch_size = 4
    nheads = 6
    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
    # nheads_kv = 1
    # batch_size = 9
    # nheads = 6
    window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype_init, requires_grad=True)
    k = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True)
    v = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True)
    q = q.to(dtype)
    k = k.to(dtype)
    v = v.to(dtype)
    softmax_scale = q.shape[-1] ** (-0.5)
    descale_q = torch.tensor([descale], dtype=torch.float32, device='cuda')
    descale_k = torch.tensor([descale], dtype=torch.float32, device='cuda')
    descale_v = torch.tensor([descale], dtype=torch.float32, device='cuda')
    out, lse = flash_attn_func(q, k, v, causal=causal, window_size=window_size, deterministic=deterministic, gqa_parallel=gqa_parallel,
                               descale_q=descale_q, descale_k=descale_k, descale_v=descale_v)
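    # Reconstruct bf16 inputs scaled by the descale factors so the reference
    # attention matches what the fp8 kernel computed with descale_q/k/v.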
    q = q.to(dtype_init)
    k = k.to(dtype_init)
    v = v.to(dtype_init)
    descale_q = descale_q.to(dtype_init)
    descale_k = descale_k.to(dtype_init)
    descale_v = descale_v.to(dtype_init)
    q = q * descale_q
    k = k * descale_k
    v = v * descale_v
    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        window_size=window_size,
    )
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        window_size=window_size,
        upcast=False,
        reorder_ops=True,
    )
    # qk = torch.einsum('bshd,bthd->bhst', q, k).float()
    # m = qk.amax(-1, keepdim=True)
    # s_tmp = torch.exp((qk - m) / math.sqrt(d))
    # exp_sum = s_tmp.sum(-1)
    # qk = torch.einsum('bthd,bshd->bhts', q.float() / math.sqrt(d), k.float())
    # lse_ref = torch.logsumexp(qk, dim=-1)
    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
    # if not causal:
    #     print(f"LSE max diff: {(lse - lse_ref).abs().max().item()}")
    # breakpoint()
    # dS = torch.einsum('bthd,bshd->bhts', g.float(), v.float())
    # P = torch.softmax(qk, -1)
    # dP = P * (dS - do_o.unsqueeze(1))
    # dQ = torch.einsum('bhts,bshd->bthd', dP, k.float())
    # dV = torch.einsum('bhts,bthd->bshd', P, g.float())
    # dK = torch.einsum('bhts,bthd->bshd', dP, q.float())
    # breakpoint()
    # assert (out - out_ref).abs().max().item() <= 4 * (out_pt - out_ref).abs().max().item() + 1e-2
    atol = 4 * (out_pt - out_ref).abs().max().item() + 1e-2
    torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=atol, check_dtype=False)
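

# Main fp16/bf16 test: checks the forward output (and, for d <= 128 and non-fp8
# dtypes, the backward gradients) against the reference implementation.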
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
# @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
# @pytest.mark.parametrize("mha_type", ["mha"])
@pytest.mark.parametrize("causal", [False, True])
# @pytest.mark.parametrize("causal", [False])
@pytest.mark.parametrize("local", [False, True])
# @pytest.mark.parametrize("local", [True])
@pytest.mark.parametrize("deterministic", [False, True])
# @pytest.mark.parametrize("deterministic", [True])
@pytest.mark.parametrize("gqa_parallel", [False, True])
# @pytest.mark.parametrize("gqa_parallel", [False])
# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192])
# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192])
# @pytest.mark.parametrize('d', [56, 80])
# @pytest.mark.parametrize("d", [64, 128, 256])
# @pytest.mark.parametrize("d", [64, 96, 128])
# @pytest.mark.parametrize("d", [64])
@pytest.mark.parametrize("d", [64, 128, 256])
@pytest.mark.parametrize("descale", [1.0])
# @pytest.mark.parametrize("descale", [1.0, 2.0, 3.0, 4.0])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        (1, 1),
        (64, 128),
        (128, 128),
        (256, 256),
        (113, 203),
        (128, 217),
        (113, 211),
        (108, 256),
        (256, 512),
        (384, 256),
        (640, 128),
        (512, 256),
        (1024, 1024),
        (1023, 1024),
        (1024, 1023),
        (4096, 4096),
        (4224, 4224),
    ],
)
# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
def test_flash_attn_output(
    seqlen_q, seqlen_k, d, causal, local, deterministic, mha_type, dtype, descale, gqa_parallel
):
    device = "cuda"
    if dtype == torch.float8_e4m3fn:
        dtype_init = torch.bfloat16
    else:
        dtype_init = dtype
    print(dtype)
    print('causal', causal)
    print('local', local)
    print('gqa_parallel', gqa_parallel)
    # set seed
    torch.random.manual_seed(42)
    # batch_size = 40
    # nheads = 16
    batch_size = 4
    nheads = 6
    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
    # nheads_kv = 1
    # batch_size = 9
    # nheads = 6
    window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype_init, requires_grad=True)
    k = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True)
    v = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True)
    q = q.to(dtype)
    k = k.to(dtype)
    v = v.to(dtype)
    softmax_scale = q.shape[-1] ** (-0.5)
    descale_q = torch.tensor([descale], dtype=torch.float32, device='cuda')
    descale_k = torch.tensor([descale], dtype=torch.float32, device='cuda')
    descale_v = torch.tensor([descale], dtype=torch.float32, device='cuda')
    if dtype != torch.float8_e4m3fn:
        out, lse = flash_attn_func(q, k, v, causal=causal, window_size=window_size, deterministic=deterministic, gqa_parallel=gqa_parallel)
    else:
        out, lse = flash_attn_func(q, k, v, causal=causal, window_size=window_size, deterministic=deterministic, gqa_parallel=gqa_parallel,
                                   descale_q=descale_q, descale_k=descale_k, descale_v=descale_v)
    q = q.to(dtype_init)
    k = k.to(dtype_init)
    v = v.to(dtype_init)
    if dtype == torch.float8_e4m3fn:
        descale_q = descale_q.to(dtype_init)
        descale_k = descale_k.to(dtype_init)
        descale_v = descale_v.to(dtype_init)
        q = q * descale_q
        k = k * descale_k
        v = v * descale_v
    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        window_size=window_size,
    )
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        window_size=window_size,
        upcast=False,
        reorder_ops=True,
    )
    # qk = torch.einsum('bshd,bthd->bhst', q, k).float()
    # m = qk.amax(-1, keepdim=True)
    # s_tmp = torch.exp((qk - m) / math.sqrt(d))
    # exp_sum = s_tmp.sum(-1)
    # qk = torch.einsum('bthd,bshd->bhts', q.float() / math.sqrt(d), k.float())
    # lse_ref = torch.logsumexp(qk, dim=-1)
    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
    # if not causal:
    #     print(f"LSE max diff: {(lse - lse_ref).abs().max().item()}")
    # breakpoint()
    if d <= 128 and dtype != torch.float8_e4m3fn:
        g = torch.randn_like(out)
        do_o = (g.float() * out.float()).sum(-1)
        dq, dk, dv = torch.autograd.grad(out, (q, k, v), g)
        dq_ref, dk_ref, dv_ref = torch.autograd.grad(out_ref, (q, k, v), g)
        dq_pt, dk_pt, dv_pt = torch.autograd.grad(out_pt, (q, k, v), g)
        print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}")
        print(f"dK max diff: {(dk - dk_ref).abs().max().item()}")
        print(f"dV max diff: {(dv - dv_ref).abs().max().item()}")
        print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}")
        print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}")
        print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}")
        print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}")
        print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}")
        print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}")
        print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}")
        print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
        print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")
        # dS = torch.einsum('bthd,bshd->bhts', g.float(), v.float())
        # P = torch.softmax(qk, -1)
        # dP = P * (dS - do_o.unsqueeze(1))
        # dQ = torch.einsum('bhts,bshd->bthd', dP, k.float())
        # dV = torch.einsum('bhts,bthd->bshd', P, g.float())
        # dK = torch.einsum('bhts,bthd->bshd', dP, q.float())
        # breakpoint()
    # Check that FlashAttention's numerical error is at most twice the numerical error
    # of a Pytorch implementation.
    # breakpoint()
    if dtype != torch.float8_e4m3fn:
        assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 3e-5
    else:
        # just test correctness of fp8 kernel w/o further quantization techniques
        assert (out - out_ref).abs().max().item() <= 4 * (out_pt - out_ref).abs().max().item() + 2e-2
    if d <= 128 and dtype != torch.float8_e4m3fn:
        assert (dq - dq_ref).abs().max().item() <= 2 * (dq_pt - dq_ref).abs().max().item() + 3e-5
        assert (dk - dk_ref).abs().max().item() <= 2 * (dk_pt - dk_ref).abs().max().item() + 3e-5
        assert (dv - dv_ref).abs().max().item() <= 2 * (dv_pt - dv_ref).abs().max().item() + 3e-5
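

# Variable-length (unpadded) test: sequences are packed using cu_seqlens/seqused
# metadata, run through flash_attn_varlen_func, and re-padded before comparison
# with the padded reference.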
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
# @pytest.mark.parametrize("dtype", [torch.float16])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
# @pytest.mark.parametrize("mha_type", ["mha"])
@pytest.mark.parametrize("causal", [False, True])
# @pytest.mark.parametrize("causal", [True])
@pytest.mark.parametrize("local", [False, True])
# @pytest.mark.parametrize("local", [False])
@pytest.mark.parametrize("deterministic", [False, True])
# @pytest.mark.parametrize("deterministic", [False])
@pytest.mark.parametrize("add_unused_qkv", [False, True])
# @pytest.mark.parametrize("add_unused_qkv", [True])
# @pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize('d', [256])
# @pytest.mark.parametrize("d", [64, 128, 256])
@pytest.mark.parametrize("d", [64, 128])
# @pytest.mark.parametrize("d", [128])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        (1, 1),
        (1, 3),
        (2, 1),
        (511, 1),
        (3, 513),
        (64, 128),
        (113, 203),
        (128, 128),
        (128, 217),
        (113, 211),
        (108, 256),
        (256, 512),
        (384, 256),
        (512, 256),
        (640, 128),
        (1024, 1024),
        (1023, 1024),
        (1024, 1023),
        (2048, 2048),
    ],
)
# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
def test_flash_attn_varlen_output(
    seqlen_q, seqlen_k, d, causal, local, deterministic, add_unused_qkv, mha_type, dtype
):
    if (
        max(seqlen_q, seqlen_k) >= 2048
        and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30
    ):
        pytest.skip()  # Reference implementation OOM
    device = "cuda"
    # set seed
    torch.random.manual_seed(0)
    # batch_size = 1
    # nheads = 1
    # nheads_kv = 1
    batch_size = 9
    nheads = 6
    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
    window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
    k = torch.randn(
        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
    )
    v = torch.randn(
        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
    )
    query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random", zero_lengths=False)
    key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random", zero_lengths=True)
    # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full')
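    # When add_unused_qkv is set, carve a disjoint "unused" mask out of the padding
    # mask so the seqused_q / seqused_k code paths see tokens that are allocated
    # but not attended to; their outputs/gradients are zeroed before comparison.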
    def _gen_unused_masks(padding_mask, add_unused, max_seq_len, bs, device):
        if add_unused:
            another_mask = generate_random_padding_mask(max_seq_len, bs, device)
            attn_mask = torch.logical_and(padding_mask, another_mask)
            unused_mask = torch.logical_xor(torch.logical_or(padding_mask, another_mask), attn_mask)
        else:
            attn_mask = padding_mask
            unused_mask = None
        return attn_mask, unused_mask

    query_padding_mask, query_unused_mask = _gen_unused_masks(query_padding_mask, add_unused_qkv, seqlen_q, batch_size, q.device)
    key_padding_mask, key_unused_mask = _gen_unused_masks(key_padding_mask, add_unused_qkv, seqlen_k, batch_size, k.device)

    (
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        seqused_q,
        seqused_k,
        max_seqlen_q,
        max_seqlen_k,
        q,
        k,
        v,
        output_pad_fn,
        dq_pad_fn,
        dk_pad_fn,
    ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False, query_unused_mask=query_unused_mask, key_unused_mask=key_unused_mask)
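    # q_unpad/k_unpad/v_unpad are the packed (total_tokens, nheads, d) tensors;
    # cu_seqlens_* are cumulative per-batch offsets and seqused_* the per-sequence
    # token counts, as returned by generate_qkv in tests.test_util.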
    # print("cu_seqlens_q: ", cu_seqlens_q)
    # print("cu_seqlens_k: ", cu_seqlens_k)
    # print("q_unpad, shape: ", q_unpad.shape)
    # print("k_unpad, shape: ", k_unpad.shape)
    # print("v_unpad, shape: ", v_unpad.shape)
    out_unpad, sm_lse = flash_attn_varlen_func(
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        causal=causal,
        deterministic=deterministic,
        seqused_q=seqused_q,
        seqused_k=seqused_k,
        window_size=window_size,
    )
    out = output_pad_fn(out_unpad)
    if query_unused_mask is not None:
        q_zero_masking = rearrange(query_unused_mask, "b s -> b s 1 1")
        out.masked_fill_(q_zero_masking, 0.0)
    dropout_mask = None
    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        query_padding_mask,
        key_padding_mask,
        causal=causal,
        window_size=window_size,
    )
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        query_padding_mask,
        key_padding_mask,
        causal=causal,
        window_size=window_size,
        upcast=False,
        reorder_ops=True,
    )
    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")

    g = torch.randn_like(out)
    if d <= 128:
        (
            dq_unpad,
            dk_unpad,
            dv_unpad,
        ) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g)
        dk = dk_pad_fn(dk_unpad)
        dv = dk_pad_fn(dv_unpad)
        if key_unused_mask is not None:
            k_zero_masking = rearrange(key_unused_mask, "b s -> b s 1 1")
            dk.masked_fill_(k_zero_masking, 0.0)
            dv.masked_fill_(k_zero_masking, 0.0)
        (
            dq_ref,
            dk_ref,
            dv_ref,
        ) = torch.autograd.grad(out_ref, (q, k, v), g)
        zero_masking = rearrange(torch.logical_not(torch.any(key_padding_mask, 1)), "b -> b 1 1 1")
        dk_ref.masked_fill_(zero_masking, 0.0)
        dv_ref.masked_fill_(zero_masking, 0.0)
        (
            dq_pt,
            dk_pt,
            dv_pt,
        ) = torch.autograd.grad(out_pt, (q, k, v), g)
        dk_pt.masked_fill_(zero_masking, 0.0)
        dv_pt.masked_fill_(zero_masking, 0.0)
        dq = dq_pad_fn(dq_unpad)
        if query_unused_mask is not None:
            dq.masked_fill_(q_zero_masking, 0.0)
        print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}")
        print(f"dK max diff: {(dk - dk_ref).abs().max().item()}")
        print(f"dV max diff: {(dv - dv_ref).abs().max().item()}")
        print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}")
        print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}")
        print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}")
        print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}")
        print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}")
        print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}")
        print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}")
        print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
        print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")

    # Check that FlashAttention's numerical error is at most twice the numerical error
    # of a Pytorch implementation.
    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
    if d <= 128:
        assert (dq - dq_ref).abs().max().item() < 1e-4 or (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item()
        assert (dk - dk_ref).abs().max().item() < 1e-4 or (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item()
        assert (dv - dv_ref).abs().max().item() < 1e-4 or (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item()