## Production Distillation Config for RunPod (RTX A6000 48GB)
##
## 3-stage MambaInLlama pipeline with research-validated hyperparameters:
## - SSM 10x LR boost (NB11)
## - T=2.0 for KL distillation (NB09)
## - alpha=0.7 for KL/CE balance (NB09)
## - SVD split for MoE diversity (NB03)
##
## Teacher: CohereLabs/tiny-aya-global (3.35B, validated in NB01-NB11)
## Student: Aetheris HybridMambaMoE (~800M params)
## Data: ClimbMix (Stage 1-2), multilingual chat (Stage 3)
##
## Wayy Research, 2024-2026
---
# Teacher model (tiny-aya-global: 3.35B, 70+ langs, no gated access required)
teacher:
  name: "CohereLabs/tiny-aya-global"
  dtype: "bfloat16"
  device_map: "auto"
# Student model
student:
  config_path: "configs/student.yaml"
  dtype: "bfloat16"
  checkpoint: null  # Set to Stage 1 checkpoint for Stage 2, etc.
# Languages (10 core for multilingual equity tracking)
languages: [en, es, hi, zh, ar, sw, tr, ja, id, te]

# Seed for reproducibility across all stages
seed: 42
# --- Stage 0: Block Conversion ---
conversion:
  strategy: "weight_map"
  a_init: "exponential_decay"
  delta_init: "uniform"
  ffn_to_moe: "svd_split"  # Best diversity (CKA=0.097 vs replicate=0.88)
# --- Stage 1: Layer Alignment ---
stage1:
  enabled: true
  total_steps: 10000
  lr: 1.0e-4
  warmup_steps: 500
  batch_size: 4
  gradient_accumulation: 8  # Effective batch = 4 * 8 = 32
  gradient_checkpointing: true
  max_seq_len: 512
  loss_type: "mse+cosine"
  cka_threshold: 0.75
  cka_check_every: 500
  save_every: 1000
  log_every: 50
  output_dir: "checkpoints/stage1_alignment"
# --- Stage 2: KL Distillation ---
stage2:
  enabled: true
  total_steps: 20000
  lr: 5.0e-5  # Base LR
  ssm_lr_multiplier: 10.0  # SSM blocks get 10x (NB11: KL -26%, agreement +12x)
  warmup_steps: 500
  batch_size: 4
  gradient_accumulation: 8  # Effective batch = 4 * 8 = 32
  gradient_checkpointing: true
  max_seq_len: 512
  temperature: 2.0  # NB09: T=2.0 good balance
  alpha: 0.7  # NB09: alpha=0.7
  save_every: 2000
  log_every: 50
  output_dir: "checkpoints/stage2_kl"
# --- Stage 3: SFT ---
stage3:
  enabled: true
  total_steps: 5000
  lr: 2.0e-5
  warmup_steps: 200
  batch_size: 4
  gradient_accumulation: 4  # Effective batch = 4 * 4 = 16
  gradient_checkpointing: true
  max_seq_len: 1024
  save_every: 500
  log_every: 25
  output_dir: "checkpoints/stage3_sft"
# --- Data ---
data:
  # Stage 1 & 2: ClimbMix (retokenized with Aya vocab)
  climbmix:
    dataset: "nvidia/ClimbMix"
    mode: "retokenize"
    streaming: true
    buffer_size: 500
    min_tokens: 32
  # Stage 3: Multilingual chat data (aya_collection is non-gated)
  sft:
    dataset_name: "CohereForAI/aya_collection"
    streaming: true
# --- Evaluation ---
eval:
  max_new_tokens: 128
  temperature: 0.7
  top_p: 0.9
  output_dir: "results/runpod"