test_flash_attn.py

import math

import pytest
import torch
import torch.nn.functional as F
from einops import rearrange, repeat

from flash_attn_interface import flash_attn_func, flash_attn_varlen_func
from tests.test_util import generate_random_padding_mask, generate_qkv, construct_local_mask, attention_ref

ABS_TOL = 5e-3
REL_TOL = 1e-1


def print_diffs(out, out_ref):
    # Debugging helper (not called by the tests below): print every element whose
    # absolute or relative difference from the reference exceeds the tolerances above.
    out_1d = out.flatten()
    out_ref_1d = out_ref.flatten()
    for idx, (e_o, e_o_ref) in enumerate(zip(out_1d, out_ref_1d)):
        diff = e_o - e_o_ref
        abs_diff = abs(diff)
        abs_ref = abs(e_o_ref + 1e-5)
        relative_diff = abs_diff / abs_ref
        if abs_diff > ABS_TOL or relative_diff > REL_TOL:
            print(f"==== diff ==== {idx}, test: {e_o}, ref: {e_o_ref}")


@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
# @pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
# @pytest.mark.parametrize("mha_type", ["gqa"])
@pytest.mark.parametrize("causal", [False, True])
# @pytest.mark.parametrize("causal", [True])
# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192])
# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192])
# @pytest.mark.parametrize('d', [56, 80])
@pytest.mark.parametrize("d", [64, 128, 256])
# @pytest.mark.parametrize("d", [128])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        (257, 1),
        (64, 128),
        (128, 128),
        (256, 256),
        (113, 203),
        (128, 217),
        (113, 211),
        (108, 256),
        (256, 512),
        (384, 256),
        (640, 128),
        (512, 256),
        (1024, 1024),
        (1023, 1024),
        (1024, 1023),
        (2048, 2048),
    ],
)
# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
def test_flash_attn_output(
    seqlen_q, seqlen_k, d, causal, mha_type, dtype
):
    device = "cuda"
    # set seed
    torch.random.manual_seed(0)
    # batch_size = 40
    # nheads = 16
    batch_size = 9
    nheads = 6
    # MHA uses as many KV heads as query heads; GQA uses 2 KV heads, MQA uses 1.
    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
    # nheads_kv = 2
    # batch_size = 9
    # nheads = 6
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
    k = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True)
    v = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True)
    out, lse = flash_attn_func(q, k, v, causal=causal)
    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
    )
    # Baseline without upcasting and with reordered ops: approximates the numerical
    # error of a straightforward PyTorch implementation in the test dtype.
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        upcast=False,
        reorder_ops=True,
    )
    # qk = torch.einsum('bshd,bthd->bhst', q, k).float()
    # m = qk.amax(-1, keepdim=True)
    # s_tmp = torch.exp((qk - m) / math.sqrt(d))
    # exp_sum = s_tmp.sum(-1)
    # qk = torch.einsum('bthd,bshd->bhts', q.float() / math.sqrt(d), k.float())
    # lse_ref = torch.logsumexp(qk, dim=-1)
    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
    # if not causal:
    #     print(f"LSE max diff: {(lse - lse_ref).abs().max().item()}")
    # breakpoint()
    # if d <= 128:
    #     g = torch.randn_like(out)
    #     do_o = (g.float() * out.float()).sum(-1)
    #     dq, dk, dv = torch.autograd.grad(out, (q, k, v), g)
    #     dq_ref, dk_ref, dv_ref = torch.autograd.grad(out_ref, (q, k, v), g)
    #     dq_pt, dk_pt, dv_pt = torch.autograd.grad(out_pt, (q, k, v), g)
    #     print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}")
    #     print(f"dK max diff: {(dk - dk_ref).abs().max().item()}")
    #     print(f"dV max diff: {(dv - dv_ref).abs().max().item()}")
    #     print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}")
    #     print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}")
    #     print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}")
    #     print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}")
    #     print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}")
    #     print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}")
    #     print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}")
    #     print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
    #     print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")
    #     dS = torch.einsum('bthd,bshd->bhts', g.float(), v.float())
    #     P = torch.softmax(qk, -1)
    #     dP = P * (dS - do_o.unsqueeze(1))
    #     dQ = torch.einsum('bhts,bshd->bthd', dP, k.float())
    #     dV = torch.einsum('bhts,bthd->bshd', P, g.float())
    #     dK = torch.einsum('bhts,bthd->bshd', dP, q.float())
    #     breakpoint()
    # Check that FlashAttention's numerical error is at most twice the numerical error
    # of a Pytorch implementation.
    # breakpoint()
    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
    # if d <= 128:
    #     assert (dq - dq_ref).abs().max().item() <= 2 * (dq_pt - dq_ref).abs().max().item()
    #     assert (dk - dk_ref).abs().max().item() <= 2 * (dk_pt - dk_ref).abs().max().item()
    #     assert (dv - dv_ref).abs().max().item() <= 2 * (dv_pt - dv_ref).abs().max().item()


@pytest.mark.parametrize("dtype", [torch.float16])
@pytest.mark.parametrize("causal", [False, True])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
# @pytest.mark.parametrize('causal', [True])
# @pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize('d', [128])
@pytest.mark.parametrize("d", [64, 128, 256])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        (1, 1),
        (1, 3),
        (2, 1),
        (511, 1),
        (3, 513),
        (64, 128),
        (113, 203),
        (128, 128),
        (128, 217),
        (113, 211),
        (108, 256),
        (256, 512),
        (384, 256),
        (512, 256),
        (640, 128),
        (1024, 1024),
        (1023, 1024),
        (1024, 1023),
        (2048, 2048),
    ],
)
# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
def test_flash_attn_varlen_output(
    seqlen_q, seqlen_k, d, causal, mha_type, dtype
):
    if (
        max(seqlen_q, seqlen_k) >= 2048
        and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30
    ):
        pytest.skip()  # Reference implementation OOM
    device = "cuda"
    # set seed
    torch.random.manual_seed(0)
    # batch_size = 1
    # nheads = 1
    batch_size = 9
    nheads = 6
    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
    k = torch.randn(
        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
    )
    v = torch.randn(
        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
    )
    query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random")
    key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random")
    # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full')
    # Unpad q/k/v into the packed (varlen) layout expected by flash_attn_varlen_func.
    (
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        q,
        k,
        v,
        output_pad_fn,
        dq_pad_fn,
        dk_pad_fn,
    ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False)
    # print("cu_seqlens_q: ", cu_seqlens_q)
    # print("cu_seqlens_k: ", cu_seqlens_k)
    # print("q_unpad, shape: ", q_unpad.shape)
    # print("k_unpad, shape: ", k_unpad.shape)
    # print("v_unpad, shape: ", v_unpad.shape)
    out_unpad, sm_lse = flash_attn_varlen_func(
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        causal=causal,
    )
    out = output_pad_fn(out_unpad)
    dropout_mask = None
    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        query_padding_mask,
        key_padding_mask,
        causal=causal,
    )
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        query_padding_mask,
        key_padding_mask,
        causal=causal,
        upcast=False,
        reorder_ops=True,
    )
    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
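    # Same acceptance criterion as test_flash_attn_output, assumed to apply to the
    # varlen path as well: FlashAttention's error vs. the reference should be at most
    # twice the error of the non-upcast PyTorch baseline.
    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()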


@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn])
# @pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
# @pytest.mark.parametrize("mha_type", ["gqa"])
@pytest.mark.parametrize("causal", [False, True])
# @pytest.mark.parametrize("causal", [True])
# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192])
# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192])
# @pytest.mark.parametrize('d', [56, 80])
@pytest.mark.parametrize("d", [64, 128, 256])
# @pytest.mark.parametrize("d", [128])
# @pytest.mark.parametrize("d", [256])
@pytest.mark.parametrize(
    "seqlen_q,seqlen_k",
    [
        (64, 128),
        (128, 128),
        (256, 256),
        (113, 203),
        (128, 217),
        (113, 211),
        (108, 256),
        (256, 512),
        (384, 256),
        (640, 128),
        (512, 256),
        (1024, 1024),
        (1023, 1024),
        (1024, 1023),
        (2048, 2048),
    ],
)
def test_flash_attn_output_fp8(
    seqlen_q, seqlen_k, d, causal, mha_type, dtype
):
    device = "cuda"
    # set seed
    torch.random.manual_seed(0)
    # batch_size = 40
    # nheads = 16
    batch_size = 9
    nheads = 6
    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
    # batch_size = 1
    # nheads = 1
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=torch.float16, requires_grad=True)
    k = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=torch.float16, requires_grad=True)
    v = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=torch.float16, requires_grad=True)
    out, lse = flash_attn_func(q.to(dtype), k.to(dtype), v.to(dtype).transpose(1, 3).contiguous().clone(), causal=causal)
    # Round-trip q/k/v through fp8 so the references see the same quantization error
    # as the kernel inputs.
    q = q.to(dtype).to(torch.float16)
    k = k.to(dtype).to(torch.float16)
    v = v.to(dtype).to(torch.float16)
    out_ref, attn_ref = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
    )
    out_pt, attn_pt = attention_ref(
        q,
        k,
        v,
        None,
        None,
        causal=causal,
        upcast=False,
        reorder_ops=True,
    )
    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
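

# Minimal usage sketch (illustrative, not part of the test suite): calling flash_attn_func
# directly on random fp16 tensors, assuming a CUDA device supported by this
# flash_attn_interface build. Shapes follow the (batch, seqlen, nheads, headdim)
# layout used throughout the tests above.
if __name__ == "__main__":
    _q = torch.randn(2, 128, 6, 64, device="cuda", dtype=torch.float16)
    _k = torch.randn(2, 128, 6, 64, device="cuda", dtype=torch.float16)
    _v = torch.randn(2, 128, 6, 64, device="cuda", dtype=torch.float16)
    _out, _lse = flash_attn_func(_q, _k, _v, causal=True)
    print("out shape:", tuple(_out.shape), "lse shape:", tuple(_lse.shape))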