from transformers import PretrainedConfig


class QMoEConfig(PretrainedConfig):
    model_type = "qmoe"

    def __init__(self, vocab_size=50257, d_model=768, num_layers=12,
                 num_heads=16, max_seq_len=512, num_experts=8, moe_top_k=2,
                 ffn_dim=2048, **kwargs):
        super().__init__(**kwargs)
        # Transformer backbone dimensions
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len
        # Mixture-of-Experts settings: num_experts expert FFNs per layer,
        # with moe_top_k of them active per token
        self.num_experts = num_experts
        self.moe_top_k = moe_top_k
        self.ffn_dim = ffn_dim
        # Decoder-only language model: no cross-attention, no KV cache,
        # and input/output embeddings are not tied
        self.is_decoder = True
        self.add_cross_attention = False
        self.use_cache = False
        self.tie_word_embeddings = False
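Because `QMoEConfig` subclasses `PretrainedConfig`, it inherits the standard Hugging Face serialization methods. The snippet below is a minimal sketch of round-tripping the config to disk; the `qmoe-config` directory name is only an example.

```python
# Hypothetical usage: save the config to a directory and reload it.
config = QMoEConfig(num_experts=16, moe_top_k=2)
config.save_pretrained("qmoe-config")            # writes qmoe-config/config.json
reloaded = QMoEConfig.from_pretrained("qmoe-config")
assert reloaded.num_experts == 16 and reloaded.model_type == "qmoe"
```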