qwen.py

# Copyright (c) Alibaba Cloud.
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE

from transformers import PretrainedConfig


class QWenConfig(PretrainedConfig):
    model_type = "qwen"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        num_hidden_layers=32,
        num_attention_heads=32,
        emb_dropout_prob=0.0,
        attn_dropout_prob=0.0,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        max_position_embeddings=8192,
        scale_attn_weights=True,
        use_cache=True,
        bf16=False,
        fp16=False,
        fp32=False,
        kv_channels=128,
        rotary_pct=1.0,
        rotary_emb_base=10000,
        use_dynamic_ntk=True,
        use_logn_attn=True,
        use_flash_attn="auto",
        intermediate_size=22016,
        no_bias=True,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.emb_dropout_prob = emb_dropout_prob
        self.attn_dropout_prob = attn_dropout_prob
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.max_position_embeddings = max_position_embeddings
        self.bf16 = bf16
        self.fp16 = fp16
        self.fp32 = fp32
        self.kv_channels = kv_channels
        self.rotary_pct = rotary_pct
        self.rotary_emb_base = rotary_emb_base
        self.use_dynamic_ntk = use_dynamic_ntk
        self.use_logn_attn = use_logn_attn
        self.use_flash_attn = use_flash_attn
        self.no_bias = no_bias
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
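
A minimal usage sketch, not part of the original file: assuming this module is saved as qwen.py on the import path, QWenConfig can be instantiated with defaults or overrides and serialized through the standard PretrainedConfig helpers provided by transformers.

# usage_example.py (illustrative; assumes qwen.py is importable)
from qwen import QWenConfig

# Default Qwen-7B-style configuration.
config = QWenConfig()

# Override selected hyperparameters; any extra kwargs are passed through to PretrainedConfig.
small_config = QWenConfig(hidden_size=2048, num_hidden_layers=24, num_attention_heads=16)

# Serialize to JSON, save to disk, and reload with the PretrainedConfig API.
print(config.to_json_string())
small_config.save_pretrained("./qwen-small")
reloaded = QWenConfig.from_pretrained("./qwen-small")
assert reloaded.hidden_size == 2048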