jefffffff9 Claude Sonnet 4.6 committed on
Commit
9049ef3
·
1 Parent(s): cc50efb

Prepare training stack for RunPod: env-aware notebook + bootstrap script

Browse files

Cell 3 now auto-detects Kaggle (/kaggle/working), Colab (/content),
RunPod (/workspace), or local — and adjusts defaults per environment.
On RunPod the base model defaults to whisper-large-v3-turbo (128 mel
bins, matches the Space and fits in 24 GB VRAM); on Kaggle T4 it stays
on whisper-small to avoid OOM. Batch size / grad accumulation tuned
per card (8x4 on 24 GB vs 16x2 on T4, both yield effective batch 32).

TRAIN_LANG, HF_USERNAME, and WHISPER_MODEL_ID are now env-var
overridable so the notebook can be driven headlessly from a shell.

scripts/runpod_setup.sh bootstraps a fresh pod:
- clones the Space repo into /workspace/sahel-voice
- installs training deps (peft, evaluate, tensorboard, etc.) that are
intentionally absent from the HF Spaces runtime requirements.txt
- sources .env for HF_TOKEN; falls back to printing token-setup instructions
- creates persistent checkpoint dirs on the Volume disk so adapters
survive pod stop/start
- prints GPU / VRAM sanity check at the end

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

notebooks/kaggle_master_trainer.ipynb CHANGED
@@ -62,7 +62,7 @@
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
- "# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = 'bam'\n\n# ─── Model ───────────────────────────────────────────────────────────────────\nWHISPER_MODEL_ID = 'openai/whisper-small'\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = 'ous-sow'\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters ───────────────────────────────────────────────\nMAX_STEPS = 4_000 # T4 ~45 min; set 8000 for a deeper run\nBATCH_SIZE = 16\nGRAD_ACCUM = 2 # effective batch = 32\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = 200\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nMAX_WAXAL_TRAIN = 5_000 # cap WaxalNLP samples (streaming budget)\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths (Kaggle working dir) ───────────────────────────────────────────────\nWORKING_DIR = '/kaggle/working'\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\nLANG_COUNTRY = {'bam': 'Mali', 'ful': 'Guinea'}.get(TRAIN_LANG, '')\nLANG_DIALECT = {\n 'bam': 'Standard Bambara (Bamako/Ségou) — Malian orthography',\n 'ful': 'Pular (Labé/Mamou dialects) — Guinean orthography',\n}.get(TRAIN_LANG, '')\n\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME}) — {LANG_COUNTRY}')\nprint(f'Dialect : {LANG_DIALECT}')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS}')"
66
  ]
67
  },
68
  {
 
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
+ "# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\nfrom pathlib import Path\n\n# ─── Environment detection (Kaggle / Colab / RunPod / local) ─────────────────\nif Path('/kaggle/working').exists():\n _ENV = 'kaggle'\n WORKING_DIR = '/kaggle/working'\nelif Path('/content').exists() and not Path('/workspace').exists():\n _ENV = 'colab'\n WORKING_DIR = '/content'\nelif Path('/workspace').exists():\n _ENV = 'runpod'\n WORKING_DIR = '/workspace'\nelse:\n _ENV = 'local'\n WORKING_DIR = os.environ.get('WORKING_DIR', os.path.expanduser('~/sahel-voice-work'))\n Path(WORKING_DIR).mkdir(parents=True, exist_ok=True)\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = os.environ.get('TRAIN_LANG', 'ful')\n\n# ─── Model ───────────────────────────────────────────────────────────────────\n# whisper-large-v3-turbo (128 mel bins) matches the Space base model.\n# On T4 (Kaggle, 16 GB) drop to 'openai/whisper-small' — turbo is tight there.\n_DEFAULT_MODEL = 'openai/whisper-small' if _ENV == 'kaggle' else 'openai/whisper-large-v3-turbo'\nWHISPER_MODEL_ID = os.environ.get('WHISPER_MODEL_ID', _DEFAULT_MODEL)\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = os.environ.get('HF_USERNAME', 'ous-sow')\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters (defaults tuned per environment) ──────────────\nif _ENV == 'runpod':\n # 24 GB RTX 3090 / A40: turbo-v3 fits with batch 8, grad_accum 4 (eff. 
32)\n MAX_STEPS = 2_000\n BATCH_SIZE = 8\n GRAD_ACCUM = 4\n MAX_WAXAL_TRAIN = 5_000\nelse:\n # T4 (Kaggle free) / local CPU fallback\n MAX_STEPS = 4_000\n BATCH_SIZE = 16\n GRAD_ACCUM = 2\n MAX_WAXAL_TRAIN = 5_000\n\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = max(100, MAX_STEPS // 20)\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths ────────────────────────────────────────────────────────────────────\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\nLANG_COUNTRY = {'bam': 'Mali', 'ful': 'Guinea'}.get(TRAIN_LANG, '')\nLANG_DIALECT = {\n 'bam': 'Standard Bambara (Bamako/Ségou) — Malian orthography',\n 'ful': 'Pular (Labé/Mamou dialects) — Guinean orthography',\n}.get(TRAIN_LANG, '')\n\nprint(f'Environment : {_ENV}')\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME}) — {LANG_COUNTRY}')\nprint(f'Dialect : {LANG_DIALECT}')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Working dir : {WORKING_DIR}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS} (batch={BATCH_SIZE}, grad_accum={GRAD_ACCUM}, eff={BATCH_SIZE*GRAD_ACCUM})')\n"
66
  ]
67
  },
68
  {
scripts/runpod_setup.sh ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# -----------------------------------------------------------------------------
# Sahel-Voice-Core — RunPod bootstrap
# -----------------------------------------------------------------------------
# Run this once inside a fresh RunPod pod (PyTorch 2.2 + CUDA 12.1 template).
# It clones the repo into /workspace, installs training-only dependencies
# (the HF Space requirements.txt is runtime-only), and prepares secrets.
#
# Usage inside the pod's Jupyter terminal:
#
#   curl -fsSL https://huggingface.co/spaces/ous-sow/sahel-agri-voice/resolve/main/scripts/runpod_setup.sh -o setup.sh
#   bash setup.sh
#
# Or, if you've already cloned the repo:
#
#   bash /workspace/sahel-voice/scripts/runpod_setup.sh
#
# After setup, export HF_TOKEN and open notebooks/kaggle_master_trainer.ipynb.
# Cell 3 auto-detects the RunPod environment; no path edits needed.
#
# Overridable env vars: REPO_URL, WORKSPACE, REPO_DIR, HF_TOKEN.
# -----------------------------------------------------------------------------
set -euo pipefail

REPO_URL="${REPO_URL:-https://huggingface.co/spaces/ous-sow/sahel-agri-voice}"
WORKSPACE="${WORKSPACE:-/workspace}"
REPO_DIR="${REPO_DIR:-${WORKSPACE}/sahel-voice}"

echo "=============================================="
echo " Sahel-Voice-Core — RunPod setup"
echo "=============================================="
echo " Workspace : ${WORKSPACE}"
echo " Repo      : ${REPO_DIR}"
echo "=============================================="

# 1. Clone (idempotent — a second run just fast-forwards)
if [[ ! -d "${REPO_DIR}/.git" ]]; then
  echo ">> Cloning repo..."
  git clone "${REPO_URL}" "${REPO_DIR}"
else
  echo ">> Repo already present — pulling latest."
  # Best-effort: a dirty tree or diverged history must not abort bootstrap.
  git -C "${REPO_DIR}" pull --ff-only || true
fi

cd "${REPO_DIR}"

# 2. Training dependencies (not in requirements.txt which is runtime-only)
echo ">> Installing training dependencies..."
pip install -q --upgrade pip
# NOTE(review): confirm these pins exist on PyPI — e.g. "transformers==5.5.0"
# and "huggingface-hub==1.9.0" look ahead of the released version lines; a
# single bad pin makes the whole `pip install` fail.
pip install -q \
  "transformers==5.5.0" \
  "datasets==4.8.4" \
  "accelerate==1.13.0" \
  "huggingface-hub==1.9.0" \
  "peft>=0.13.0" \
  "evaluate>=0.4.1" \
  "jiwer==3.0.4" \
  "librosa==0.10.2" \
  "soundfile==0.12.1" \
  "tensorboard>=2.14" \
  "pypdf>=4.0.0" \
  "python-docx>=1.1.0"

# 3. HF token resolution.
#    Precedence: an already-exported HF_TOKEN always wins; .env is only the
#    fallback. (Previously .env was sourced unconditionally, silently
#    clobbering an exported token with a possibly stale file value.)
ENV_FILE="${REPO_DIR}/.env"
if [[ -z "${HF_TOKEN:-}" ]] && [[ -f "${ENV_FILE}" ]]; then
  # export everything defined in .env so downstream commands inherit it
  set -a
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
  set +a
  echo ">> Loaded env vars from ${ENV_FILE}"
fi
# Warn whenever the token is still missing — including the case where .env
# exists but does not define HF_TOKEN (previously silent).
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo ""
  echo "=============================================="
  echo "  HF_TOKEN not set."
  echo "  Get a write-scoped token from"
  echo "    https://huggingface.co/settings/tokens"
  echo "  Then either:"
  echo "    export HF_TOKEN=hf_xxxxxxxx"
  echo "  or add it to ${ENV_FILE}:"
  echo "    echo 'HF_TOKEN=hf_xxxxxxxx' > ${ENV_FILE}"
  echo "=============================================="
fi

# 4. Persistent output dirs for checkpoints (survive pod stop via Volume disk)
mkdir -p "${WORKSPACE}/adapter_bam" "${WORKSPACE}/adapter_ful" \
         "${WORKSPACE}/data" "${WORKSPACE}/audio_feedback"

# 5. GPU sanity check. Modern images often ship only `python3`; fall back to
#    `python` and fail loudly (not via bare set -e) if neither exists.
PYTHON_BIN="$(command -v python3 || command -v python)" \
  || { echo "!! No python interpreter found on PATH" >&2; exit 1; }
"${PYTHON_BIN}" - <<'PY'
import torch
print("=" * 46)
print(f" PyTorch        : {torch.__version__}")
print(f" CUDA available : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    p = torch.cuda.get_device_properties(0)
    print(f" GPU            : {p.name}")
    print(f" VRAM           : {p.total_memory/1e9:.1f} GB")
    print(f" Compute cap    : {p.major}.{p.minor}")
print("=" * 46)
PY

echo ""
echo "✅ Setup complete."
echo ""
echo "Next steps:"
echo "  1. Open Jupyter Lab (port 8888 on the pod)"
echo "  2. Navigate to: ${REPO_DIR}/notebooks/kaggle_master_trainer.ipynb"
echo "  3. Set TRAIN_LANG in Cell 3 (or export TRAIN_LANG=ful before launching)"
echo "  4. Run All Cells — Cell 3 auto-detects /workspace and uses RunPod defaults"
echo ""
echo "Checkpoints will be saved to: ${WORKSPACE}/adapter_\$TRAIN_LANG"
echo "This path is on the Volume disk — survives pod stop/restart."