# thepile.yaml

_target_: src.datamodules.language_modeling_hf.LMDataModule
dataset_name: the_pile
dataset_config_name: null
tokenizer_name: gpt2
cache_dir: ${oc.env:DATA_DIR,${data_dir}}/the_pile/cache
max_length: 2048
add_eos: True
batch_size: 4 # per GPU
batch_size_eval: ${eval:${.batch_size} * 2}
num_workers: 64 # For preprocessing only
use_shmem: False
shuffle: True
pin_memory: True
__train_len: ${div_up:374337375694, ${.max_length}}