# coding=utf-8
# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py
"""OLMo configuration"""
from transformers import PretrainedConfig


class OLMoConfig(PretrainedConfig):
    model_type = 'olmo'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # Note that the defaults for these attributes are equivalent to the
    # base GPT2 model.
    def __init__(
        self,
        d_model=768,
        n_heads=12,
        n_layers=12,
        mlp_ratio=4,
        mlp_hidden_size=None,
        activation_type="swiglu",
        block_type="sequential",
        block_group_size=1,
        alibi=False,
        alibi_bias_max=8.0,
        rope=False,
        rope_full_precision=True,
        multi_query_attention=False,
        attention_layer_norm=False,
        layer_norm_type="default",
        layer_norm_with_affine=True,
        attention_layer_norm_with_affine=True,
        max_sequence_length=1024,
        include_bias=True,
        bias_for_layer_norm=None,
        scale_logits=False,
        vocab_size=50257,
        embedding_size=50304,
        weight_tying=True,
        eos_token_id=50256,
        pad_token_id=50256,
        **kwargs,
    ):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.mlp_ratio = mlp_ratio
        self.mlp_hidden_size = mlp_hidden_size
        self.activation_type = activation_type
        self.block_type = block_type
        self.block_group_size = block_group_size
        self.alibi = alibi
        self.alibi_bias_max = alibi_bias_max
        self.rope = rope
        self.rope_full_precision = rope_full_precision
        self.multi_query_attention = multi_query_attention
        self.attention_layer_norm = attention_layer_norm
        self.layer_norm_type = layer_norm_type
        self.layer_norm_with_affine = layer_norm_with_affine
        self.attention_layer_norm_with_affine = attention_layer_norm_with_affine
        self.max_sequence_length = max_sequence_length
        self.include_bias = include_bias
        self.bias_for_layer_norm = bias_for_layer_norm
        self.scale_logits = scale_logits
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.weight_tying = weight_tying
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        # Forward the special token ids to PretrainedConfig.__init__; otherwise it
        # pops them from kwargs and resets the attributes set above to None.
        super().__init__(eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
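

# Illustrative usage sketch (not part of the adapted upstream file): build a
# config with OLMo-7B-like dimensions and read it back through the standard
# Hugging Face attribute names exposed by ``attribute_map``. The hyperparameter
# values below are assumptions chosen for demonstration only.
if __name__ == "__main__":
    config = OLMoConfig(
        d_model=4096,              # surfaced as config.hidden_size
        n_heads=32,                # surfaced as config.num_attention_heads
        n_layers=32,               # surfaced as config.num_hidden_layers
        rope=True,                 # rotary position embeddings instead of ALiBi
        alibi=False,
        weight_tying=False,
        max_sequence_length=2048,
    )
    # attribute_map lets generic Hugging Face code query the usual names even
    # though the underlying fields keep OLMo's naming scheme.
    print(config.hidden_size, config.num_attention_heads, config.num_hidden_layers)

    # Round-trip through a plain dict, mirroring what save_pretrained /
    # from_pretrained do with config.json.
    restored = OLMoConfig.from_dict(config.to_dict())
    assert restored.d_model == config.d_model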