gpt3xl-hf.yaml

# @package _global_
defaults:
  - /experiment/pile/gpt3s-hf.yaml
  - override /optimizer: adamw-zero

model:
  config:
    # GPT-3 XL scale: roughly 1.3B parameters
    n_embd: 2048
    n_head: 16
    n_layer: 24

datamodule:
  batch_size: 2  # per-GPU micro-batch size

train:
  global_batch_size: 512
  optimizer:
    lr: 2.0e-4
  scheduler:
    t_initial: 300000

trainer:
  strategy:
    # DDP with ZeRO stage-1 sharding of optimizer states
    _target_: src.utils.ddp_zero1.DDPStrategyZero1
    find_unused_parameters: False
    gradient_as_bucket_view: True
  max_steps: 400000
  # Validate every 1000 optimizer steps; the interval is counted in
  # micro-batches, hence the multiplication by accumulate_grad_batches.
  val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}}

callbacks:
  model_checkpoint:
    every_n_train_steps: 1000
  model_checkpoint_progress:
    every_n_train_steps: 12500
    fault_tolerant: False  # Saving takes too long
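Experiment files in this layout compose through Hydra's defaults list: a derived config pulls in a base experiment and overrides only the fields that change, exactly as this file builds on gpt3s-hf.yaml. As a minimal sketch of that pattern (the file name and values below are illustrative, not part of the repo), a variant of this experiment with a smaller per-GPU batch and a shorter run could look like:

# @package _global_
# Hypothetical derived config, e.g. experiment/pile/gpt3xl-hf-short.yaml
defaults:
  - /experiment/pile/gpt3xl-hf.yaml   # reuse everything defined above

datamodule:
  batch_size: 1          # halve the per-GPU micro-batch size

train:
  scheduler:
    t_initial: 150000    # shorter decay horizon (illustrative value)

trainer:
  max_steps: 200000      # stop earlier than the base 400k steps

Only the overridden keys appear in the derived file; everything else (model size, optimizer, checkpointing) is inherited from the base config unchanged.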