
# Adapted from
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
# Copyright 2023 The PygmalionAI team.
# Copyright 2023 The vLLM team.
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon configuration"""
from transformers.configuration_utils import PretrainedConfig


class RWConfig(PretrainedConfig):
    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        return not self.alibi
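

# Minimal usage sketch: the argument values below are illustrative assumptions,
# not taken from any released Falcon checkpoint. It shows how the derived
# attributes follow from the constructor arguments, including the
# hidden_size == 8192 hack that forces new_decoder_architecture.
if __name__ == "__main__":
    config = RWConfig(
        vocab_size=65024,
        hidden_size=8192,
        n_layer=60,
        n_head=128,
        n_head_kv=8,
    )
    assert config.new_decoder_architecture  # set by the hidden_size hack
    assert config.head_dim == 64            # 8192 // 128
    assert config.rotary                    # rotary embeddings since alibi is False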