olmo.py

# coding=utf-8
# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py
"""OLMo configuration"""
from transformers import PretrainedConfig


class OLMoConfig(PretrainedConfig):
    model_type = 'olmo'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # Note that the defaults for these attributes are equivalent to the
    # base GPT2 model.
    def __init__(
        self,
        d_model=768,
        n_heads=12,
        n_layers=12,
        mlp_ratio=4,
        mlp_hidden_size=None,
        activation_type="swiglu",
        block_type="sequential",
        block_group_size=1,
        alibi=False,
        alibi_bias_max=8.0,
        rope=False,
        rope_full_precision=True,
        multi_query_attention=False,
        attention_layer_norm=False,
        layer_norm_type="default",
        layer_norm_with_affine=True,
        attention_layer_norm_with_affine=True,
        max_sequence_length=1024,
        include_bias=True,
        bias_for_layer_norm=None,
        scale_logits=False,
        vocab_size=50257,
        embedding_size=50304,
        weight_tying=True,
        eos_token_id=50256,
        pad_token_id=50256,
        **kwargs,
    ):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.mlp_ratio = mlp_ratio
        self.mlp_hidden_size = mlp_hidden_size
        self.activation_type = activation_type
        self.block_type = block_type
        self.block_group_size = block_group_size
        self.alibi = alibi
        self.alibi_bias_max = alibi_bias_max
        self.rope = rope
        self.rope_full_precision = rope_full_precision
        self.multi_query_attention = multi_query_attention
        self.attention_layer_norm = attention_layer_norm
        self.layer_norm_type = layer_norm_type
        self.layer_norm_with_affine = layer_norm_with_affine
        self.attention_layer_norm_with_affine = attention_layer_norm_with_affine
        self.max_sequence_length = max_sequence_length
        self.include_bias = include_bias
        self.bias_for_layer_norm = bias_for_layer_norm
        self.scale_logits = scale_logits
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.weight_tying = weight_tying
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        # Pass the special token ids through explicitly: PretrainedConfig.__init__
        # pops them from kwargs and would otherwise reset the attributes to None.
        super().__init__(eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
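

# Minimal usage sketch (not part of the original file). The values below are
# illustrative only and do not correspond to any released OLMo checkpoint; it
# assumes this module is importable as `olmo` and relies only on the standard
# save/load API inherited from PretrainedConfig.
if __name__ == "__main__":
    config = OLMoConfig(d_model=2048, n_heads=16, n_layers=24, rope=True)

    # attribute_map lets the standard Hugging Face names alias the OLMo names.
    assert config.hidden_size == config.d_model
    assert config.num_attention_heads == config.n_heads
    assert config.num_hidden_layers == config.n_layers

    # Round-trip through the serialization API inherited from PretrainedConfig.
    config.save_pretrained("olmo-config")
    reloaded = OLMoConfig.from_pretrained("olmo-config")
    assert reloaded.n_layers == 24 and reloaded.model_type == "olmo"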