# calib_dataloader.py

# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch


def set_seed(seed):
    """Seed numpy and torch RNGs so calibration sampling is reproducible."""
    np.random.seed(seed)
    torch.random.manual_seed(seed)


def get_wikitext2(tokenizer, nsamples, seed, seqlen, path=None):
    """Load Wikitext-2 train and test datasets and tokenize.

    Args:
        tokenizer: Tokenizer to encode text.
        nsamples: Number of samples to take from train set.
        seed: Random seed for sampling.
        seqlen: Maximum sequence length.
        path: Optional local dataset path; defaults to the 'wikitext' hub
            dataset.

    Returns:
        train_loader: List of sampled and tokenized training examples.
        test_enc: Full tokenized Wikitext-2 test set.
    """
    from datasets import load_dataset
    traindata = load_dataset(path if path else 'wikitext',
                             'wikitext-2-raw-v1',
                             split='train')
    testdata = load_dataset(path if path else 'wikitext',
                            'wikitext-2-raw-v1',
                            split='test')

    trainenc = tokenizer('\n\n'.join(traindata['text']), return_tensors='pt')
    testenc = tokenizer('\n\n'.join(testdata['text']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        # Sample a random window of seqlen tokens from the concatenated text.
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        # Mask every position except the last so only it contributes to loss.
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc
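
# A small usage sketch for the sampling format above (comment-only so the
# module stays import-safe; the `tokenizer` object is an assumption, e.g. a
# Hugging Face tokenizer). Each calibration entry is an (inp, tar) pair of
# shape [1, seqlen]; `tar` is `inp` with every position except the last set
# to -100, the default ignore index of torch cross-entropy losses.
#
#   trainloader, testenc = get_wikitext2(tokenizer, nsamples=4,
#                                        seed=0, seqlen=2048)
#   inp, tar = trainloader[0]
#   inp.shape                    # torch.Size([1, 2048])
#   (tar == -100).sum().item()   # 2047, only the last token is unmasked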


def get_ptb(tokenizer, nsamples, seed, seqlen):
    """Load PTB train and validation datasets and tokenize.

    Args:
        tokenizer: Tokenizer to encode text.
        nsamples: Number of samples to take from train set.
        seed: Random seed for sampling.
        seqlen: Maximum sequence length.

    Returns:
        train_loader: List of sampled and tokenized training examples.
        test_enc: Full tokenized PTB validation set.
    """
    from datasets import load_dataset
    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
    valdata = load_dataset('ptb_text_only',
                           'penn_treebank',
                           split='validation')

    trainenc = tokenizer('\n\n'.join(traindata['sentence']),
                         return_tensors='pt')
    testenc = tokenizer('\n\n'.join(valdata['sentence']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4(tokenizer, nsamples, seed, seqlen, path=None):
    """Load C4 train and validation datasets and tokenize.

    Args:
        tokenizer: Tokenizer to encode text.
        nsamples: Number of samples to take from train set.
        seed: Random seed for sampling.
        seqlen: Maximum sequence length.
        path: Optional local dataset path; defaults to the 'allenai/c4' hub
            dataset.

    Returns:
        train_loader: List of sampled and tokenized training examples.
        test_enc: Sampled and tokenized C4 validation set.
    """
    from datasets import load_dataset
    traindata = load_dataset(
        path if path else 'allenai/c4',
        'allenai--c4',
        data_files={'train': 'en/c4-train.00000-of-01024.json.gz'},
        split='train',
        use_auth_token=False)
    valdata = load_dataset(
        path if path else 'allenai/c4',
        'allenai--c4',
        data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'},
        split='validation',
        use_auth_token=False)

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        # Keep drawing documents until one is at least seqlen tokens long.
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if trainenc.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    # Build a fixed validation set of 256 random seqlen-token windows.
    valenc = []
    for _ in range(256):
        while True:
            i = random.randint(0, len(valdata) - 1)
            tmp = tokenizer(valdata[i]['text'], return_tensors='pt')
            if tmp.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, tmp.input_ids.shape[1] - seqlen)
        j = i + seqlen
        valenc.append(tmp.input_ids[:, i:j])
    valenc = torch.hstack(valenc)

    class TokenizerWrapper:
        """Minimal wrapper exposing the stacked tensor as `.input_ids`."""

        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc
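
# Sketch of how the wrapped C4 validation set is consumed (the call below is
# an illustration, not part of this module): TokenizerWrapper only exists so
# that the stacked tensor exposes `.input_ids` like the `BatchEncoding`
# objects returned for Wikitext-2 and PTB above.
#
#   _, valenc = get_c4(tokenizer, nsamples=8, seed=0, seqlen=2048)
#   valenc.input_ids.shape       # torch.Size([1, 256 * 2048])
#   first_block = valenc.input_ids[:, :2048]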


def get_ptb_new(tokenizer, nsamples, seed, seqlen):
    """Load PTB New train and test datasets and tokenize.

    Args:
        tokenizer: Tokenizer to encode text.
        nsamples: Number of samples to take from train set.
        seed: Random seed for sampling.
        seqlen: Maximum sequence length.

    Returns:
        train_loader: List of sampled and tokenized training examples.
        test_enc: Full tokenized PTB test set.
    """
    from datasets import load_dataset
    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
    testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test')

    trainenc = tokenizer(' '.join(traindata['sentence']), return_tensors='pt')
    testenc = tokenizer(' '.join(testdata['sentence']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4_new(tokenizer, nsamples, seed, seqlen):
    """Load C4 New train and validation datasets and tokenize.

    Args:
        tokenizer: Tokenizer to encode text.
        nsamples: Number of samples to take from train set.
        seed: Random seed for sampling.
        seqlen: Maximum sequence length.

    Returns:
        train_loader: List of sampled and tokenized training examples.
        test_enc: Tokenized C4 validation set, truncated to 256 * seqlen
            tokens.
    """
    from datasets import load_dataset
    traindata = load_dataset(
        'allenai/c4',
        'allenai--c4',
        data_files={'train': 'en/c4-train.00000-of-01024.json.gz'},
        split='train')
    valdata = load_dataset(
        'allenai/c4',
        'allenai--c4',
        data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'},
        split='validation')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        # Keep drawing documents until one is at least seqlen tokens long.
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if trainenc.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
    valenc = valenc.input_ids[:, :(256 * seqlen)]

    class TokenizerWrapper:
        """Minimal wrapper exposing the truncated tensor as `.input_ids`."""

        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_pileval(tokenizer, nsamples, seed, path, seqlen=512):
    """Load pileval train dataset and tokenize.

    Args:
        tokenizer: Tokenizer to encode text.
        nsamples: Number of samples to take from train set.
        seed: Random seed for sampling.
        path: Path or URL of the pileval jsonl data file.
        seqlen: Maximum sequence length.

    Returns:
        train_loader: List of sampled and tokenized training examples.
        test_enc: Always None; this loader has no accompanying test set.
    """
    from datasets import load_dataset
    from datasets.builder import DatasetGenerationError
    try:
        dataset = load_dataset('json', data_files=path, split='train')
    except DatasetGenerationError as err:
        raise InterruptedError('There have been some issues when generating '
                               'the dataset. You could try to download it '
                               'locally first, replace the `data_files` '
                               'with local paths, or use another dataset '
                               '(c4, wiki, ptb).') from err
    dataset = dataset.shuffle(seed=seed)
    samples = []
    n_run = 0
    for data in dataset:
        line = data['text'].strip()
        line_encoded = tokenizer.encode(line)
        # Skip lines longer than 512 tokens as well as empty lines.
        if len(line_encoded) > 512:
            continue
        sample = torch.tensor([line_encoded])
        if sample.numel() == 0:
            continue
        samples.append(sample)
        n_run += 1
        if n_run == nsamples:
            break
    # Now concatenate all samples and split according to block size.
    cat_samples = torch.cat(samples, dim=1)
    n_split = cat_samples.shape[1] // seqlen
    print(f' * Split into {n_split} blocks')
    return [
        cat_samples[:, i * seqlen:(i + 1) * seqlen] for i in range(n_split)
    ], None
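
# Sketch of the pileval output format (the tokenizer and the local
# 'val.jsonl' path are assumptions): the first return value is a list of
# [1, seqlen] blocks cut from the concatenated samples, the second is
# always None.
#
#   blocks, _ = get_pileval(tokenizer, nsamples=64, seed=0,
#                           path='val.jsonl', seqlen=512)
#   blocks[0].shape              # torch.Size([1, 512])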


def get_calib_loaders(name,
                      tokenizer,
                      nsamples=128,
                      seed=0,
                      seqlen=2048,
                      path=None):
    """Get calibration data loaders for a dataset.

    Args:
        name: Dataset name ('wikitext2', 'ptb', 'c4' or 'pileval'; names
            containing 'new' select the PTB/C4 'new' variants).
        tokenizer: Tokenizer to encode text.
        nsamples: Number of samples to take from train set.
        seed: Random seed for sampling.
        seqlen: Maximum sequence length.
        path: Optional local dataset path or data file.

    Returns:
        train_loader: List of sampled and tokenized training examples.
        test_data: Full tokenized validation set.
    """
    if 'wikitext2' in name:
        return get_wikitext2(tokenizer, nsamples, seed, seqlen, path)

    if 'ptb' in name:
        if 'new' in name:
            return get_ptb_new(tokenizer, nsamples, seed, seqlen)
        return get_ptb(tokenizer, nsamples, seed, seqlen)

    if 'c4' in name:
        if 'new' in name:
            return get_c4_new(tokenizer, nsamples, seed, seqlen)
        return get_c4(tokenizer, nsamples, seed, seqlen, path)

    if 'pileval' in name:
        if path is None:
            path = 'https://the-eye.eu/public/AI/pile/val.jsonl.zst'
        return get_pileval(tokenizer, nsamples, seed, path, seqlen)
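

# Minimal end-to-end sketch, assuming `transformers` is installed; the
# 'facebook/opt-125m' checkpoint below is only an example and is not used
# anywhere else in this module.
if __name__ == '__main__':
    from transformers import AutoTokenizer

    set_seed(0)
    tok = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=True)
    calib_loader, test_data = get_calib_loaders('wikitext2',
                                                tok,
                                                nsamples=16,
                                                seed=0,
                                                seqlen=2048)
    inp, tar = calib_loader[0]
    print(f'{len(calib_loader)} calibration samples, '
          f'each of shape {tuple(inp.shape)}')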