from transformers import PretrainedConfig


class QMoEConfig(PretrainedConfig):
    model_type = "qmoe"

    def __init__(self, vocab_size=50257, d_model=768, num_layers=12,
                 num_heads=16, max_seq_len=512, num_experts=8, moe_top_k=2,
                 ffn_dim=2048, **kwargs):
        super().__init__(**kwargs)
        # Transformer backbone dimensions
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len
        # Mixture-of-Experts settings: num_experts expert FFNs per layer,
        # with moe_top_k of them active per token
        self.num_experts = num_experts
        self.moe_top_k = moe_top_k
        self.ffn_dim = ffn_dim
        # Decoder-only language model: no cross-attention, no KV cache,
        # and input/output embeddings are not tied
        self.is_decoder = True
        self.add_cross_attention = False
        self.use_cache = False
        self.tie_word_embeddings = False
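Because `QMoEConfig` subclasses `PretrainedConfig`, it inherits the standard Hugging Face serialization methods. The snippet below is a minimal sketch of round-tripping the config to disk; the `qmoe-config` directory name is only an example.

```python
# Hypothetical usage: save the config to a directory and reload it.
config = QMoEConfig(num_experts=16, moe_top_k=2)
config.save_pretrained("qmoe-config")            # writes qmoe-config/config.json
reloaded = QMoEConfig.from_pretrained("qmoe-config")
assert reloaded.num_experts == 16 and reloaded.model_type == "qmoe"
```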