## Production Distillation Config for RunPod (RTX A6000 48GB)
##
## 3-stage MambaInLlama pipeline with research-validated hyperparameters:
## - SSM 10x LR boost (NB11)
## - T=2.0 for KL distillation (NB09)
## - alpha=0.7 for KL/CE balance (NB09)
## - SVD split for MoE diversity (NB03)
##
## Teacher: CohereLabs/tiny-aya-global (3.35B, validated in NB01-NB11)
## Student: Aetheris HybridMambaMoE (~800M params)
## Data: ClimbMix (Stages 1-2), multilingual chat (Stage 3)
##
## Wayy Research, 2024-2026
# Teacher model (tiny-aya-global: 3.35B, 70+ langs, no gated access required)
teacher:
name: "CohereLabs/tiny-aya-global"
dtype: "bfloat16"
device_map: "auto"
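# Illustrative teacher load (a sketch of the standard transformers API, not
# this repo's actual loader):
#   import torch
#   from transformers import AutoModelForCausalLM
#   teacher = AutoModelForCausalLM.from_pretrained(
#       "CohereLabs/tiny-aya-global", torch_dtype=torch.bfloat16, device_map="auto")
#   teacher.eval()  # the teacher stays frozen throughout distillation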
# Student model
student:
  config_path: "configs/student.yaml"
  dtype: "bfloat16"
  checkpoint: null  # Set to the Stage 1 checkpoint for Stage 2, etc.
# Languages (10 core for multilingual equity tracking)
languages: [en, es, hi, zh, ar, sw, tr, ja, id, te]
# Seed
seed: 42
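# Seeding sketch (standard library calls):
#   import random, numpy as np, torch
#   random.seed(42); np.random.seed(42); torch.manual_seed(42)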
# --- Stage 0: Block Conversion ---
conversion:
  strategy: "weight_map"
  a_init: "exponential_decay"
  delta_init: "uniform"
  ffn_to_moe: "svd_split"  # Most diverse experts: inter-expert CKA 0.097 vs 0.88 for replicate (NB03)
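# One plausible reading of svd_split, sketched (function name, expert count,
# and the banding scheme are assumptions; NB03 only establishes that SVD-based
# splitting yields low inter-expert CKA): each expert is rebuilt from a
# disjoint band of the FFN weight's singular directions, so experts start out
# decorrelated.
#   import torch
#   def svd_split(w: torch.Tensor, n_experts: int) -> list[torch.Tensor]:
#       u, s, vh = torch.linalg.svd(w, full_matrices=False)
#       bands = torch.chunk(torch.arange(s.numel()), n_experts)
#       return [u[:, b] @ torch.diag(s[b]) @ vh[b, :] for b in bands]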
# --- Stage 1: Layer Alignment ---
stage1:
  enabled: true
  total_steps: 10000
  lr: 1.0e-4
  warmup_steps: 500
  batch_size: 4
  gradient_accumulation: 8
  gradient_checkpointing: true
  max_seq_len: 512
  loss_type: "mse+cosine"
  cka_threshold: 0.75
  cka_check_every: 500
  save_every: 1000
  log_every: 50
  output_dir: "checkpoints/stage1_alignment"
# --- Stage 2: KL Distillation ---
stage2:
  enabled: true
  total_steps: 20000
  lr: 5.0e-5               # Base LR
  ssm_lr_multiplier: 10.0  # SSM blocks get 10x base LR (NB11: KL -26%, agreement +12x)
  warmup_steps: 500
  batch_size: 4
  gradient_accumulation: 8
  gradient_checkpointing: true
  max_seq_len: 512
  temperature: 2.0  # Distillation temperature (NB09: T=2.0 was a good balance)
  alpha: 0.7        # Weight on the KL term; CE gets 1 - alpha (NB09)
  save_every: 2000
  log_every: 50
  output_dir: "checkpoints/stage2_kl"
# --- Stage 3: SFT ---
stage3:
  enabled: true
  total_steps: 5000
  lr: 2.0e-5
  warmup_steps: 200
  batch_size: 4
  gradient_accumulation: 4
  gradient_checkpointing: true
  max_seq_len: 1024
  save_every: 500
  log_every: 25
  output_dir: "checkpoints/stage3_sft"
# --- Data ---
data:
# Stage 1 & 2: ClimbMix (retokenized with Aya vocab)
climbmix:
dataset: "nvidia/ClimbMix"
mode: "retokenize"
streaming: true
buffer_size: 500
min_tokens: 32
# Stage 3: Multilingual chat data (aya_collection is non-gated)
sft:
dataset_name: "CohereForAI/aya_collection"
streaming: true
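# Illustrative streaming load (standard datasets API; the split name is an
# assumption, check the hub pages):
#   from datasets import load_dataset
#   climbmix = load_dataset("nvidia/ClimbMix", split="train", streaming=True)
#   sample = next(iter(climbmix))  # records stream in without a full download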
# --- Evaluation ---
eval:
  max_new_tokens: 128
  temperature: 0.7
  top_p: 0.9
  output_dir: "results/runpod"