# openwebtext.yaml
_target_: src.datamodules.language_modeling_hf.LMDataModule
dataset_name: openwebtext
dataset_config_name: null
tokenizer_name: gpt2
cache_dir: ${oc.env:DATA_DIR,${data_dir}}/openwebtext/cache
max_length: 1024
val_ratio: 0.0005
val_split_seed: 2357
add_eos: True
batch_size: 8 # per GPU
batch_size_eval: ${eval:${.batch_size} * 2}
num_workers: 32 # For preprocessing only
shuffle: True
pin_memory: True
__train_len: ${div_up:9035582198, ${.max_length}}