## Production Distillation Config for RunPod (RTX A6000 48GB)
##
## 3-stage MambaInLlama pipeline with research-validated hyperparameters:
## - SSM 10x LR boost (NB11)
## - T=2.0 for KL distillation (NB09)
## - alpha=0.7 for KL/CE balance (NB09)
## - SVD split for MoE diversity (NB03)
##
## Teacher: CohereLabs/tiny-aya-global (3.35B, validated in NB01-NB11)
## Student: Aetheris HybridMambaMoE (~800M params)
## Data: ClimbMix (Stage 1-2), multilingual chat (Stage 3)
##
## Wayy Research, 2024-2026
---
# Teacher model (tiny-aya-global: 3.35B, 70+ langs, no gated access required)
teacher:
  name: "CohereLabs/tiny-aya-global"
  dtype: "bfloat16"
  device_map: "auto"
# Student model
student:
  config_path: "configs/student.yaml"
  dtype: "bfloat16"
  checkpoint: null  # Set to Stage 1 checkpoint for Stage 2, etc.
# Languages (10 core for multilingual equity tracking)
languages: [en, es, hi, zh, ar, sw, tr, ja, id, te]

# Seed for reproducibility across all stages
seed: 42
# --- Stage 0: Block Conversion ---
conversion:
  strategy: "weight_map"
  a_init: "exponential_decay"
  delta_init: "uniform"
  ffn_to_moe: "svd_split"  # Best diversity (CKA=0.097 vs replicate=0.88)
# --- Stage 1: Layer Alignment ---
stage1:
  enabled: true
  total_steps: 10000
  lr: 1.0e-4
  warmup_steps: 500
  batch_size: 4
  gradient_accumulation: 8  # Effective batch = 4 * 8 = 32
  gradient_checkpointing: true
  max_seq_len: 512
  loss_type: "mse+cosine"
  cka_threshold: 0.75
  cka_check_every: 500
  save_every: 1000
  log_every: 50
  output_dir: "checkpoints/stage1_alignment"
# --- Stage 2: KL Distillation ---
stage2:
  enabled: true
  total_steps: 20000
  lr: 5.0e-5  # Base LR
  ssm_lr_multiplier: 10.0  # SSM blocks get 10x (NB11: KL -26%, agreement +12x)
  warmup_steps: 500
  batch_size: 4
  gradient_accumulation: 8  # Effective batch = 4 * 8 = 32
  gradient_checkpointing: true
  max_seq_len: 512
  temperature: 2.0  # NB09: T=2.0 good balance
  alpha: 0.7  # NB09: alpha=0.7
  save_every: 2000
  log_every: 50
  output_dir: "checkpoints/stage2_kl"
# --- Stage 3: SFT ---
stage3:
  enabled: true
  total_steps: 5000
  lr: 2.0e-5
  warmup_steps: 200
  batch_size: 4
  gradient_accumulation: 4  # Effective batch = 4 * 4 = 16
  gradient_checkpointing: true
  max_seq_len: 1024
  save_every: 500
  log_every: 25
  output_dir: "checkpoints/stage3_sft"
# --- Data ---
data:
  # Stage 1 & 2: ClimbMix (retokenized with Aya vocab)
  climbmix:
    dataset: "nvidia/ClimbMix"
    mode: "retokenize"
    streaming: true
    buffer_size: 500
    min_tokens: 32
  # Stage 3: Multilingual chat data (aya_collection is non-gated)
  sft:
    dataset_name: "CohereForAI/aya_collection"
    streaming: true
# --- Evaluation ---
eval:
  max_new_tokens: 128
  temperature: 0.7
  top_p: 0.9
  output_dir: "results/runpod"