base.yaml

# @package _global_
defaults:
  - override /trainer: default # choose trainer from 'configs/trainer/'
  - override /model: null
  - override /datamodule: thepile
  - override /optimizer: adamw-apex # slight speedup (1-2%) over PyTorch AdamW
  - override /scheduler: cosine-warmup-timm
  - override /callbacks: [default, norm-monitor]
  - override /metrics: [perplexity, num-tokens]
  - override /logger: wandb

# All parameters below will be merged with the parameters from the default
# configurations set above; this allows you to overwrite only the specified
# parameters.

task:
  _target_: src.tasks.seq.SequenceLMModel

seed: 1111

trainer:
  accelerator: gpu
  devices: 8
  num_nodes: 1
  accumulate_grad_batches: ${div_up:${train.global_batch_size}, ${eval:${trainer.devices} * ${datamodule.batch_size} * ${trainer.num_nodes}}}
  max_steps: 800000
  val_check_interval: ${eval:2000 * ${.accumulate_grad_batches}}
  check_val_every_n_epoch: null # we don't care about epoch boundaries
  precision: bf16
  gradient_clip_val: 1.0
  strategy: null
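  # With the values in this file, accumulate_grad_batches resolves to
  # div_up(256, 8 * 16 * 1) = 2 and val_check_interval to 4000 micro-batches,
  # i.e. validation runs every 2000 optimizer steps. Note that div_up and eval
  # are custom OmegaConf resolvers (assumed to be registered by the training
  # entrypoint; they are not built into Hydra/OmegaConf).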

datamodule:
  batch_size: 16 # per GPU
  batch_size_eval: ${.batch_size} # fused dense only supports batch sizes up to 64k
  max_length: 2048
  fault_tolerant: True
  ddp: ${eval:"${trainer.devices} > 1"}
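  # With this config, ddp resolves to True (8 devices), and each optimizer step
  # sees 256 sequences * 2048 tokens ≈ 0.5M tokens.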

train:
  gpu_mem: ${eval:"round(float(__import__('subprocess').check_output('nvidia-smi -i 0 --query-gpu=memory.total --format=csv,noheader,nounits', shell=True).strip().decode()) / 1000)"}
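  # gpu_mem shells out to nvidia-smi, takes the total memory of GPU 0 (reported
  # in MiB), divides by 1000 and rounds, giving an approximate size in GB. It is
  # not referenced elsewhere in this file; presumably derived experiment configs
  # use it to scale batch size to the available memory.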
  global_batch_size: 256
  optimizer:
    lr: 6e-4
    weight_decay: 0.1
  optimizer_param_grouping:
    bias_weight_decay: False
    normalization_weight_decay: False
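  # The param-grouping flags exclude biases and normalization weights from
  # weight decay, the usual practice for transformer LM training.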
  scheduler:
    t_in_epochs: False
    t_initial: 600000
    warmup_lr_init: 1e-6
    warmup_t: ${eval:0.01 * ${trainer.max_steps}}
    lr_min: ${eval:0.1 * ${train.optimizer.lr}}
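  # With trainer.max_steps = 800000, warmup_t resolves to 8000 steps and lr_min
  # to 6e-5; since t_in_epochs is False, the cosine decay is measured in
  # optimizer steps and runs over t_initial = 600000 steps.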
  loss_fn:
    # This is faster and uses less memory than torch.nn.CrossEntropyLoss.
    # It's also more numerically stable when training in 16-bit precision (e.g. with DeepSpeed).
    _target_: flash_attn.losses.cross_entropy.CrossEntropyLoss
    inplace_backward: True # to save memory
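  # inplace_backward writes the gradient into the logits buffer during the
  # backward pass (per the flash_attn implementation), so the logits should not
  # be reused after the loss is computed.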

eval:
  log_on_step: True # 1 training epoch takes too long, so we want to see metrics per train step

callbacks:
  model_checkpoint:
    monitor: val/loss
    mode: min
    save_top_k: 3
    save_last: True
    every_n_train_steps: 1000
    dirpath: ${work_dir}/checkpoints/${oc.select:name,''}
    filename: step_{step}
    auto_insert_metric_name: False
  model_checkpoint_progress:
    _target_: src.callbacks.model_checkpoint.ModelCheckpointMine
    # fault_tolerant: True # The .pl_auto_save.ckpt doesn't get saved by all workers
    every_n_train_steps: 50000
    save_last: False
    save_top_k: -1 # save all the checkpoints
    dirpath: ${..model_checkpoint.dirpath}
    filename: progress_step_{step}
    auto_insert_metric_name: False
  early_stopping: null
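
# Checkpointing: the main callback keeps the 3 best checkpoints by val/loss
# (plus the last) every 1000 steps; the progress callback additionally keeps
# every 50000-step checkpoint so long runs can be resumed or analyzed later.
#
# Typical launch (assuming this experiment file lives under configs/experiment/
# and the Hydra entrypoint is run.py, as in lightning-hydra-template layouts;
# adjust the path to wherever base.yaml actually sits):
#   python run.py experiment=pile/base trainer.devices=8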