test_util.py

import math

import torch
from einops import rearrange, repeat

from flash_attn.bert_padding import pad_input, unpad_input


def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random", zero_lengths=False):
    assert mode in ["full", "random", "third"]
    if mode == "full":
        lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32)
    elif mode == "random":
        lengths = torch.randint(
            max(0 if zero_lengths else 1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device
        )
    elif mode == "third":
        lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device)

    if zero_lengths:
        # Generate zero-lengths every 5 batches and the last batch.
        for i in range(batch_size):
            if i % 5 == 0:
                lengths[i] = 0
        lengths[-1] = 0
    padding_mask = (
        repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths
    )
    return padding_mask
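

# Illustrative usage sketch (not part of the original test utilities): shows the
# expected shape and row-wise structure of the generated mask. The sizes, the CPU
# device, and the helper name are arbitrary assumptions for this example.
def _example_padding_mask(device="cpu"):
    mask = generate_random_padding_mask(max_seqlen=16, batch_size=4, device=device)
    # Each row is a prefix of True (valid tokens) followed by False (padding).
    assert mask.shape == (4, 16) and mask.dtype == torch.bool
    return mask.sum(dim=-1)  # per-sequence valid lengths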


def generate_qkv(
    q, k, v, query_padding_mask=None, key_padding_mask=None,
    kvpacked=False, qkvpacked=False, add_unused_qkv=False,
    query_unused_mask=None, key_unused_mask=None,
):
    """
    Arguments:
        q: (batch_size, seqlen_q, nheads, d)
        k: (batch_size, seqlen_k, nheads_k, d)
        v: (batch_size, seqlen_k, nheads_k, d)
        query_padding_mask: (batch_size, seqlen_q), bool
        key_padding_mask: (batch_size, seqlen_k), bool
    """
    assert not (kvpacked and qkvpacked)
    batch_size, seqlen_q, nheads, d = q.shape
    _, seqlen_k, nheads_k, _ = k.shape
    assert k.shape == (batch_size, seqlen_k, nheads_k, d)
    assert v.shape == (batch_size, seqlen_k, nheads_k, d)
    if query_unused_mask is not None or key_unused_mask is not None:
        assert not kvpacked
        assert not qkvpacked

    if query_padding_mask is not None:
        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q, seqused_q = unpad_input(
            q, query_padding_mask, query_unused_mask
        )
        output_pad_fn = lambda output_unpad: pad_input(
            output_unpad, indices_q, batch_size, seqlen_q
        )
    else:
        q_unpad = rearrange(q, "b s h d -> (b s) h d")
        cu_seqlens_q = torch.arange(
            0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device
        )
        seqused_q = None
        max_seqlen_q = seqlen_q
        output_pad_fn = lambda output_unpad: rearrange(
            output_unpad, "(b s) h d -> b s h d", b=batch_size
        )

    if key_padding_mask is not None:
        k_unpad, indices_k, cu_seqlens_k, max_seqlen_k, seqused_k = unpad_input(k, key_padding_mask, key_unused_mask)
        v_unpad, _, _, _, _ = unpad_input(v, key_padding_mask, key_unused_mask)
    else:
        k_unpad = rearrange(k, "b s h d -> (b s) h d")
        v_unpad = rearrange(v, "b s h d -> (b s) h d")
        cu_seqlens_k = torch.arange(
            0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device
        )
        seqused_k = None
        max_seqlen_k = seqlen_k

    if qkvpacked:
        assert (query_padding_mask == key_padding_mask).all()
        assert nheads == nheads_k
        qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
        qkv = torch.stack([q, k, v], dim=2)
        if query_padding_mask is not None:
            dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q)
        else:
            dqkv_pad_fn = lambda dqkv_unpad: rearrange(
                dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
            )
        return (
            qkv_unpad.detach().requires_grad_(),
            cu_seqlens_q,
            max_seqlen_q,
            qkv.detach().requires_grad_(),
            output_pad_fn,
            dqkv_pad_fn,
        )
    elif kvpacked:
        kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
        kv = torch.stack([k, v], dim=2)
        dq_pad_fn = output_pad_fn
        if key_padding_mask is not None:
            dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k)
        else:
            dkv_pad_fn = lambda dkv_unpad: rearrange(
                dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
            )
        return (
            q_unpad.detach().requires_grad_(),
            kv_unpad.detach().requires_grad_(),
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q,
            max_seqlen_k,
            q.detach().requires_grad_(),
            kv.detach().requires_grad_(),
            output_pad_fn,
            dq_pad_fn,
            dkv_pad_fn,
        )
    else:
        dq_pad_fn = output_pad_fn
        if key_padding_mask is not None:
            dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k)
        else:
            dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size)
        return (
            q_unpad.detach().requires_grad_(),
            k_unpad.detach().requires_grad_(),
            v_unpad.detach().requires_grad_(),
            cu_seqlens_q,
            cu_seqlens_k,
            seqused_q,
            seqused_k,
            max_seqlen_q,
            max_seqlen_k,
            q.detach().requires_grad_(),
            k.detach().requires_grad_(),
            v.detach().requires_grad_(),
            output_pad_fn,
            dq_pad_fn,
            dk_pad_fn,
        )
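

# Illustrative usage sketch (not part of the original test utilities): unpads
# random q/k/v with a shared random padding mask and shows that output_pad_fn
# re-pads the flattened tensor back to (batch_size, seqlen, nheads, d). Shapes,
# dtype, device, and the helper name are arbitrary assumptions for this example.
def _example_generate_qkv(device="cpu", dtype=torch.float32):
    batch_size, seqlen, nheads, d = 2, 8, 3, 4
    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    padding_mask = generate_random_padding_mask(seqlen, batch_size, device)
    (q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k,
     max_seqlen_q, max_seqlen_k, q_, k_, v_, output_pad_fn, dq_pad_fn, dk_pad_fn) = generate_qkv(
        q, k, v, padding_mask, padding_mask
    )
    # cu_seqlens_* are cumulative sequence lengths; the last entry equals the
    # total number of unpadded tokens, i.e. the flattened first dimension.
    assert q_unpad.shape[0] == cu_seqlens_q[-1]
    assert output_pad_fn(q_unpad).shape == (batch_size, seqlen, nheads, d)
    return q_unpad, cu_seqlens_q, max_seqlen_q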


def construct_local_mask(
    seqlen_q,
    seqlen_k,
    window_size=(-1, -1),  # -1 means infinite window size
    query_padding_mask=None,
    key_padding_mask=None,
    device=None,
    key_leftpad=None,
):
    row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
    col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
    if key_leftpad is not None:
        key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1")
        col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0])
        col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32)
    sk = (
        seqlen_k
        if key_padding_mask is None
        else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
    )
    sq = (
        seqlen_q
        if query_padding_mask is None
        else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
    )
    if window_size[0] < 0:
        return col_idx > row_idx + sk - sq + window_size[1]
    else:
        sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
        return torch.logical_or(
            col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
            col_idx < row_idx + sk - sq - window_size[0],
        )
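

# Illustrative usage sketch (not part of the original test utilities): with
# window_size=(-1, 0) and no padding, the local mask reduces to the usual causal
# mask, i.e. every position above the diagonal is masked. Sizes, device, and the
# helper name are arbitrary assumptions for this example.
def _example_local_mask(device="cpu"):
    mask = construct_local_mask(seqlen_q=4, seqlen_k=4, window_size=(-1, 0), device=device)
    # True marks positions that are filled with -inf before the softmax.
    expected = torch.triu(torch.ones(4, 4, dtype=torch.bool, device=device), diagonal=1)
    assert torch.equal(mask, expected)
    return mask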


def attention_ref(
    q,
    k,
    v,
    query_padding_mask=None,
    key_padding_mask=None,
    attn_bias=None,
    dropout_p=0.0,
    dropout_mask=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite window size
    softcap=0.0,
    upcast=True,
    reorder_ops=False,
    key_leftpad=None,
):
    """
    Arguments:
        q: (batch_size, seqlen_q, nheads, head_dim)
        k: (batch_size, seqlen_k, nheads_k, head_dim)
        v: (batch_size, seqlen_k, nheads_k, head_dim)
        query_padding_mask: (batch_size, seqlen_q)
        key_padding_mask: (batch_size, seqlen_k)
        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
        dropout_p: float
        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k)
        causal: whether to apply causal masking
        window_size: (int, int), left and right window size
        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
            output back to fp16/bf16.
        reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.)
            without changing the math. This is to estimate the numerical error from operation
            reordering.
    Output:
        output: (batch_size, seqlen_q, nheads, head_dim)
        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout
    """
    if causal:
        window_size = (window_size[0], 0)
    dtype_og = q.dtype
    if upcast:
        q, k, v = q.float(), k.float(), v.float()
    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
    k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2])
    v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2])
    d = q.shape[-1]
    if not reorder_ops:
        scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k)
    else:
        scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d))
    if softcap > 0:
        scores /= softcap
        scores = scores.tanh()
        scores *= softcap
    if key_padding_mask is not None:
        scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf"))
    if window_size[0] >= 0 or window_size[1] >= 0:
        local_mask = construct_local_mask(
            seqlen_q,
            seqlen_k,
            window_size,
            query_padding_mask,
            key_padding_mask,
            q.device,
            key_leftpad=key_leftpad,
        )
        scores.masked_fill_(local_mask, float("-inf"))
    if attn_bias is not None:
        scores = scores + attn_bias
    attention = torch.softmax(scores, dim=-1).to(v.dtype)
    # Some rows might be completely masked out so we fill them with zero instead of NaN
    if window_size[0] >= 0 or window_size[1] >= 0:
        attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0)
    # We want to mask here so that the attention matrix doesn't have any NaNs
    # Otherwise we'll get NaN in dV
    if query_padding_mask is not None:
        attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0)
    dropout_scaling = 1.0 / (1 - dropout_p)
    # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling
    # output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
    if dropout_mask is not None:
        attention_drop = attention.masked_fill(~dropout_mask, 0.0)
    else:
        attention_drop = attention
    output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
    if query_padding_mask is not None:
        output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0)
    if key_padding_mask is not None:
        output.masked_fill_(rearrange(torch.logical_not(torch.any(key_padding_mask, 1)), "b -> b 1 1 1"), 0.0)
    return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)
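

# Illustrative usage sketch (not part of the original test utilities): runs the
# reference attention on random inputs with a key padding mask and causal masking,
# then checks the documented output shapes. This reference output is typically
# what a fused attention kernel's result is compared against in the tests; here
# we only exercise the reference path itself. Sizes, dtype, device, and the
# helper name are arbitrary assumptions for this example.
def _example_attention_ref(device="cpu", dtype=torch.float32):
    batch_size, seqlen, nheads, d = 2, 8, 3, 4
    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device)
    out, attn = attention_ref(q, k, v, key_padding_mask=key_padding_mask, causal=True)
    assert out.shape == (batch_size, seqlen, nheads, d)
    assert attn.shape == (batch_size, nheads, seqlen, seqlen)
    return out, attn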