from typing import List, Optional

from transformers import PretrainedConfig


class MLPSpeculatorConfig(PretrainedConfig):
    model_type = "mlp_speculator"
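
    # PretrainedConfig resolves aliased attribute names through attribute_map,
    # so config.hidden_size returns config.emb_dim.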
    attribute_map = {
        "hidden_size": "emb_dim",
    }

    def __init__(self,
                 vocab_size: int = 32000,
                 emb_dim: int = 4096,
                 inner_dim: int = 0,
                 n_predict: int = 3,
                 top_k_tokens_per_head: Optional[List[int]] = None,
                 n_candidates: int = 5,
                 tie_weights: bool = False,
                 scale_input: bool = False,
                 **kwargs):
        """
        Initialize an MLPSpeculatorConfig.

        Args:
            vocab_size: int
                the model vocab size
            emb_dim: int
                the model embedding dimension
            inner_dim: int
                the inner dimension of the model. If 0, will be the emb_dim.
            n_predict: int
                the number of lookaheads for the speculator
            top_k_tokens_per_head: List[int]
                Number of tokens to consider from each head when forming the
                candidate tree.
                For each candidate branch in the tree, head n produces topk[n]
                additional sub-branches.
                NOTE: This parameter is currently unused.
            n_candidates: int
                number of child candidates to create per sequence
            tie_weights: bool
                If true, use a single set of weights for every model
                head/stage after the first. The initial projection
                from the base model may have a different size, so that
                stays separate.
            scale_input: bool
                If true, will scale the initial hidden states from
                the base model.
        """
        if top_k_tokens_per_head is None:
            top_k_tokens_per_head = [5, 4, 3]
        assert len(top_k_tokens_per_head) == n_predict
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.inner_dim = inner_dim
        self.n_predict = n_predict
        self.top_k_tokens_per_head = top_k_tokens_per_head
        self.n_candidates = n_candidates
        self.num_lookahead_tokens = n_predict
        self.tie_weights = tie_weights
        self.scale_input = scale_input
        super().__init__(**kwargs)
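
# Example usage (an illustrative sketch, not part of the original module):
# build a config for a 3-head speculator. top_k_tokens_per_head must have
# length n_predict, and the "hidden_size" alias resolves to emb_dim via
# attribute_map above. The concrete values below are for illustration only.
if __name__ == "__main__":
    config = MLPSpeculatorConfig(
        vocab_size=32000,
        emb_dim=4096,
        inner_dim=3072,
        n_predict=3,
        top_k_tokens_per_head=[5, 4, 3],
        n_candidates=5,
        tie_weights=True,
        scale_input=True,
    )
    print(config.hidden_size)           # 4096, aliased to emb_dim
    print(config.num_lookahead_tokens)  # 3, mirrors n_predict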