test_btlm.py

# Copyright (c) 2023, Tri Dao.

import time

import torch
import pytest
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

from flash_attn.models.gpt import GPTLMHeadModel
from flash_attn.models.btlm import btlm_config_to_gpt2_config, remap_state_dict_hf_btlm
from flash_attn.utils.pretrained import state_dict_from_pretrained
from flash_attn.utils.generation import update_graph_cache


@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"])
def test_btlm_state_dict(model_name):
    config = btlm_config_to_gpt2_config(
        AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    )
    pretrained_state_dict = remap_state_dict_hf_btlm(state_dict_from_pretrained(model_name), config)
    model = GPTLMHeadModel(config, device="meta")  # Without device='meta' init is very slow
    state_dict = model.state_dict()
    assert len(state_dict.keys()) == len(pretrained_state_dict.keys())
    assert state_dict.keys() == pretrained_state_dict.keys()
    for k in state_dict.keys():
        assert state_dict[k].shape == pretrained_state_dict[k].shape
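

# For orientation (inferred from the parameter comparisons in test_btlm_init at the bottom
# of this file; not an authoritative or exhaustive listing), remap_state_dict_hf_btlm renames
# (and, where layouts differ, reshapes) the HF checkpoint into flash-attn's GPT naming, e.g.:
#     transformer.wte.weight          -> transformer.embeddings.word_embeddings.weight
#     transformer.h.{l}.attn.c_attn.* -> transformer.layers.{l}.mixer.Wqkv.*
#     transformer.h.{l}.attn.c_proj.* -> transformer.layers.{l}.mixer.out_proj.*
#     transformer.h.{l}.mlp.*         -> transformer.layers.{l}.mlp.fc1 / fc2 parameters
# so the test above only needs to compare key sets and per-key shapes.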


@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"])
def test_btlm_optimized(model_name):
    """Check that our implementation of BTLM (with all optimizations enabled) matches the
    HF implementation: the error of our fp16 forward pass against the HF fp32 forward pass
    should be comparable to (here, within 3x of) the error of the HF fp16 forward pass
    against the same fp32 reference. The same tolerance pattern is sketched as a small
    helper after this test.
    """
    dtype = torch.float16
    device = "cuda"
    config = btlm_config_to_gpt2_config(
        AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    )
    config.fused_bias_fc = True
    config.fused_dropout_add_ln = True
    config.residual_in_fp32 = True

    pretrained_state_dict = remap_state_dict_hf_btlm(state_dict_from_pretrained(model_name), config)
    model = GPTLMHeadModel(config, device=device, dtype=dtype)
    model.load_state_dict(pretrained_state_dict)
    model.eval()

    torch.manual_seed(0)
    batch_size = 2
    max_seqlen = 256
    seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device)
    input_ids = torch.randint(
        0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device
    )
    with torch.no_grad():
        out = model.transformer(input_ids)
        logits = model(input_ids).logits
    del model

    # Without device_map, the reference model is loaded on the CPU, which is very slow.
    # device_map="auto" lets HF place the fp32 reference model on the available GPU(s).
    model_ref = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="auto", trust_remote_code=True
    )
    model_ref.eval()
    with torch.no_grad():
        out_ref = model_ref.transformer(input_ids).last_hidden_state.to(device=device)
        logits_ref = model_ref(input_ids).logits.to(device=device)
    del model_ref

    model_hf = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map={"": device},
        trust_remote_code=True,
    )
    model_hf.eval()
    with torch.no_grad():
        out_hf = model_hf.transformer(input_ids).last_hidden_state
        logits_hf = model_hf(input_ids).logits
    del model_hf

    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}")
    print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}")
    assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item()

    print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}")
    print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}")
    print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}")
    print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}")
    assert (logits - logits_ref).abs().max().item() < 3 * (
        logits_hf - logits_ref
    ).abs().max().item()
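

# A small helper capturing the tolerance pattern used above and in the generation test below.
# This is a sketch added for readability, not part of the original test suite: the optimized
# fp16 output is accepted when its error against the fp32 reference stays within a small
# multiple of the error that the stock HF fp16 model already incurs.
def _assert_close_to_ref(out, out_hf, out_ref, multiple=3.0):
    err = (out - out_ref).abs().max().item()        # error of the flash-attn fp16 output
    err_hf = (out_hf - out_ref).abs().max().item()  # error of the HF fp16 baseline
    assert err < multiple * err_hf, f"optimized error {err} exceeds {multiple} x HF fp16 error {err_hf}"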


@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"])
def test_btlm_generation(model_name):
    dtype = torch.float16
    device = "cuda"
    config = btlm_config_to_gpt2_config(
        AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    )
    config.fused_bias_fc = True
    config.fused_dropout_add_ln = True
    config.residual_in_fp32 = True

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    eos_token_id = tokenizer.eos_token_id

    torch.manual_seed(0)
    batch_size = 1
    seqlen = 2048
    max_length = 2048 + 150
    input_ids = torch.randint(
        0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device
    )

    model_hf = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=dtype, device_map={"": device}, trust_remote_code=True
    )
    model_hf.eval()
    print("HF fp16")
    torch.cuda.synchronize()
    start = time.time()
    out_hf = model_hf.generate(
        input_ids=input_ids,
        max_length=max_length,
        return_dict_in_generate=True,
        output_scores=True,
    )
    torch.cuda.synchronize()
    print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")
    del model_hf

    # device_map="auto" lets HF place the fp32 reference model on the available GPU(s).
    model_ref = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="auto", trust_remote_code=True
    )
    model_ref.eval()
    with torch.no_grad():
        logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1].to(device=device)
    del model_ref

    pretrained_state_dict = remap_state_dict_hf_btlm(state_dict_from_pretrained(model_name), config)
    model = GPTLMHeadModel(config, device=device, dtype=dtype)
    model.load_state_dict(pretrained_state_dict)
    model.eval()

    model(input_ids)  # Warm up
    print("Without CUDA graph")
    torch.cuda.synchronize()
    start = time.time()
    out = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        eos_token_id=eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
        enable_timing=True,
        teacher_outputs=out_hf.sequences,
    )
    torch.cuda.synchronize()
    print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")

    # Capture the CUDA graph outside the timed region so only steady-state decoding is measured
    batch_size, seqlen_og = input_ids.shape
    model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length)
    print("With CUDA graph")
    torch.cuda.synchronize()
    start = time.time()
    out_cg = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        cg=True,
        return_dict_in_generate=True,
        output_scores=True,
        enable_timing=True,
        teacher_outputs=out_hf.sequences,
    )
    torch.cuda.synchronize()
    print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")

    with torch.no_grad():
        logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1]
    logits_hf = torch.stack(out_hf.scores, dim=1)
    logits = torch.stack(out.scores, dim=1)
    logits_cg = torch.stack(out_cg.scores, dim=1)
    del model

    hf_error = (logits_hf - logits_ref).abs().max().item()
    print(f"HF fp16 logits max diff: {hf_error}")
    print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}")
    print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item()}")
    assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error
    assert (logits - logits_ref).abs().max().item() < 2 * hf_error
    assert torch.equal(logits_cg, logits)
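

# For reference (not part of the original test): outside of teacher-forced checking, the same
# flash-attn model can generate text directly from a tokenized prompt. A hypothetical usage
# sketch, reusing `tokenizer`, `model`, `eos_token_id`, and `device` as set up in the test above:
#
#     prompt = tokenizer("Hello, my name is", return_tensors="pt").input_ids.to(device)
#     out = model.generate(
#         input_ids=prompt,
#         max_length=prompt.shape[1] + 32,
#         eos_token_id=eos_token_id,
#         return_dict_in_generate=True,
#         output_scores=True,
#     )
#     print(tokenizer.batch_decode(out.sequences))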


@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"])
def test_btlm_init(model_name):
    dtype = torch.float32
    device = "cuda"
    btlm_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    config = btlm_config_to_gpt2_config(btlm_config)
    model = GPTLMHeadModel(config, device=device, dtype=dtype)
    model_ref = AutoModelForCausalLM.from_config(btlm_config, trust_remote_code=True).to(device)

    assert model.transformer.embeddings.word_embeddings.weight.mean().abs() < 1e-4
    assert (
        model.transformer.embeddings.word_embeddings.weight.std()
        - model_ref.transformer.wte.weight.std()
    ).abs() < 1e-4
    assert model.lm_head.weight.mean().abs() < 1e-4
    assert (model.lm_head.weight.std() - model_ref.lm_head.weight.std()).abs() < 1e-4
    for l in range(config.n_layer):
        assert model.transformer.layers[l].mixer.Wqkv.weight.mean().abs() < 1e-4
        assert (
            model.transformer.layers[l].mixer.Wqkv.weight.std()
            - model_ref.transformer.h[l].attn.c_attn.weight.std()
        ).abs() < 1e-4
        assert model.transformer.layers[l].mixer.Wqkv.bias.abs().max() == 0.0
        assert model.transformer.layers[l].mixer.out_proj.weight.mean().abs() < 1e-4
        assert (
            model.transformer.layers[l].mixer.out_proj.weight.std()
            - model_ref.transformer.h[l].attn.c_proj.weight.std()
        ).abs() < 1e-4
        assert model.transformer.layers[l].mixer.out_proj.bias.abs().max() == 0.0
        assert model.transformer.layers[l].mlp.fc1.weight.mean().abs() < 1e-4
        assert (
            model.transformer.layers[l].mlp.fc1.weight.std()
            - model_ref.transformer.h[l].mlp.c_fc.weight.std()
        ).abs() < 1e-4
        assert model.transformer.layers[l].mlp.fc1.bias.abs().max() == 0.0
        assert model.transformer.layers[l].mlp.fc2.weight.mean().abs() < 1e-4
        assert (
            model.transformer.layers[l].mlp.fc2.weight.std()
            - model_ref.transformer.h[l].mlp.c_proj.weight.std()
        ).abs() < 1e-4
        assert model.transformer.layers[l].mlp.fc2.bias.abs().max() == 0.0
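

# To run these checks locally (hypothetical invocation; adjust the path to wherever this file
# lives in your checkout), something along the lines of
#
#     pytest -q -x test_btlm.py -k "state_dict or optimized"
#
# should work. All four tests require a CUDA GPU and download the cerebras/btlm-3b-8k-base
# checkpoint from the Hugging Face Hub on first use.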