olmo.py

# coding=utf-8
# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py
"""OLMo configuration"""
from transformers import PretrainedConfig


class OLMoConfig(PretrainedConfig):
    model_type = 'olmo'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # Note that the defaults for these attributes are equivalent to the base GPT2 model.
    def __init__(
        self,
        d_model=768,
        n_heads=12,
        n_layers=12,
        mlp_ratio=4,
        mlp_hidden_size=None,
        activation_type="swiglu",
        block_type="sequential",
        block_group_size=1,
        alibi=False,
        alibi_bias_max=8.0,
        rope=False,
        rope_full_precision=True,
        multi_query_attention=False,
        attention_layer_norm=False,
        layer_norm_type="default",
        layer_norm_with_affine=True,
        attention_layer_norm_with_affine=True,
        max_sequence_length=1024,
        include_bias=True,
        bias_for_layer_norm=None,
        scale_logits=False,
        vocab_size=50257,
        embedding_size=50304,
        weight_tying=True,
        eos_token_id=50256,
        pad_token_id=50256,
        **kwargs,
    ):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.mlp_ratio = mlp_ratio
        self.mlp_hidden_size = mlp_hidden_size
        self.activation_type = activation_type
        self.block_type = block_type
        self.block_group_size = block_group_size
        self.alibi = alibi
        self.alibi_bias_max = alibi_bias_max
        self.rope = rope
        self.rope_full_precision = rope_full_precision
        self.multi_query_attention = multi_query_attention
        self.attention_layer_norm = attention_layer_norm
        self.layer_norm_type = layer_norm_type
        self.layer_norm_with_affine = layer_norm_with_affine
        self.attention_layer_norm_with_affine = attention_layer_norm_with_affine
        self.max_sequence_length = max_sequence_length
        self.include_bias = include_bias
        self.bias_for_layer_norm = bias_for_layer_norm
        self.scale_logits = scale_logits
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.weight_tying = weight_tying
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        super().__init__(**kwargs)
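
Example usage (a minimal sketch, not part of the file above): constructing the config directly and relying on attribute_map, which lets the standard Hugging Face attribute names (hidden_size, num_attention_heads, num_hidden_layers) resolve to the OLMo-specific ones. The import path `olmo` is an assumption based on this file's name; the argument values shown are arbitrary illustrations, not OLMo's released hyperparameters.

# sketch.py -- hypothetical usage of the config class defined in olmo.py
from olmo import OLMoConfig  # assumed import path for the file above

config = OLMoConfig(
    d_model=2048,
    n_heads=16,
    n_layers=16,
    rope=True,                 # rotary position embeddings instead of ALiBi/absolute
    weight_tying=False,
    max_sequence_length=2048,
)

# attribute_map maps standard HF names onto the OLMo-specific attributes,
# so generic code that reads config.hidden_size etc. still works.
assert config.hidden_size == config.d_model
assert config.num_attention_heads == config.n_heads
assert config.num_hidden_layers == config.n_layers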