Spaces:
Running
Prepare training stack for RunPod: env-aware notebook + bootstrap script
Cell 3 now auto-detects Kaggle (/kaggle/working), Colab (/content),
RunPod (/workspace), or local — and adjusts defaults per environment.
On RunPod the base model defaults to whisper-large-v3-turbo (128 mel
bins, matches the Space and fits in 24 GB VRAM); on Kaggle T4 it stays
on whisper-small to avoid OOM. Batch size / grad accumulation tuned
per card (8x4 on 24 GB vs 16x2 on T4, both yield effective batch 32).
TRAIN_LANG, HF_USERNAME, and WHISPER_MODEL_ID are now env-var
overridable so the notebook can be driven headlessly from a shell.
scripts/runpod_setup.sh bootstraps a fresh pod:
- clones the Space repo into /workspace/sahel-voice
- installs training deps (peft, evaluate, tensorboard, etc.) that are
intentionally absent from the HF Spaces runtime requirements.txt
- sources .env for HF_TOKEN; falls back to prompting the user
- creates persistent checkpoint dirs on the Volume disk so adapters
survive pod stop/start
- prints GPU / VRAM sanity check at the end
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- notebooks/kaggle_master_trainer.ipynb +1 -1
- scripts/runpod_setup.sh +112 -0
|
@@ -62,7 +62,7 @@
|
|
| 62 |
"metadata": {},
|
| 63 |
"outputs": [],
|
| 64 |
"source": [
|
| 65 |
-
"# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = '
|
| 66 |
]
|
| 67 |
},
|
| 68 |
{
|
|
|
|
| 62 |
"metadata": {},
|
| 63 |
"outputs": [],
|
| 64 |
"source": [
|
| 65 |
# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────
# Flat config script: detects the runtime environment, then derives model,
# hyper-parameter, and path defaults from it. All key knobs (TRAIN_LANG,
# HF_USERNAME, WHISPER_MODEL_ID, WORKING_DIR) are env-var overridable so the
# notebook can be driven headlessly from a shell.
import os
from pathlib import Path

# ─── Environment detection (Kaggle / Colab / RunPod / local) ─────────────────
# Kaggle is checked first; the Colab check excludes /workspace so a RunPod pod
# that happens to have /content is still classified as runpod.
if Path('/kaggle/working').exists():
    _ENV = 'kaggle'
    WORKING_DIR = '/kaggle/working'
elif Path('/content').exists() and not Path('/workspace').exists():
    _ENV = 'colab'
    WORKING_DIR = '/content'
elif Path('/workspace').exists():
    _ENV = 'runpod'
    WORKING_DIR = '/workspace'
else:
    _ENV = 'local'
    WORKING_DIR = os.environ.get('WORKING_DIR', os.path.expanduser('~/sahel-voice-work'))
    Path(WORKING_DIR).mkdir(parents=True, exist_ok=True)

# ─── Language to train ───────────────────────────────────────────────────────
# 'bam' = Bambara   'ful' = Fula
TRAIN_LANG = os.environ.get('TRAIN_LANG', 'ful')

# ─── Model ───────────────────────────────────────────────────────────────────
# whisper-large-v3-turbo (128 mel bins) matches the Space base model.
# On T4 (Kaggle, 16 GB) drop to 'openai/whisper-small' — turbo is tight there.
_DEFAULT_MODEL = 'openai/whisper-small' if _ENV == 'kaggle' else 'openai/whisper-large-v3-turbo'
WHISPER_MODEL_ID = os.environ.get('WHISPER_MODEL_ID', _DEFAULT_MODEL)
TARGET_SR = 16_000  # Whisper feature extractors expect 16 kHz audio

# ─── HuggingFace repos ───────────────────────────────────────────────────────
HF_USERNAME = os.environ.get('HF_USERNAME', 'ous-sow')
FEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'
ADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'

# ─── Training hyper-parameters (defaults tuned per environment) ──────────────
# Both branches keep the effective batch (BATCH_SIZE * GRAD_ACCUM) at 32.
if _ENV == 'runpod':
    # 24 GB RTX 3090 / A40: turbo-v3 fits with batch 8, grad_accum 4 (eff. 32)
    MAX_STEPS = 2_000
    BATCH_SIZE = 8
    GRAD_ACCUM = 4
    MAX_WAXAL_TRAIN = 5_000
else:
    # T4 (Kaggle free) / local CPU fallback
    MAX_STEPS = 4_000
    BATCH_SIZE = 16
    GRAD_ACCUM = 2
    MAX_WAXAL_TRAIN = 5_000

LEARNING_RATE = 1e-3  # NOTE(review): high for full fine-tuning — presumably adapter/LoRA training; confirm
WARMUP_STEPS = max(100, MAX_STEPS // 20)
SAVE_STEPS = 500
EVAL_STEPS = 500
LOGGING_STEPS = 50
CORRECTION_REPEAT = 3  # upsample user corrections Nx for emphasis

# ─── Paths ────────────────────────────────────────────────────────────────────
OUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'
DATA_DIR = f'{WORKING_DIR}/data'
AUDIO_DIR = f'{WORKING_DIR}/audio_feedback'

# Human-readable metadata; .get(...) fallbacks keep unknown language codes from crashing.
LANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)
LANG_COUNTRY = {'bam': 'Mali', 'ful': 'Guinea'}.get(TRAIN_LANG, '')
LANG_DIALECT = {
    'bam': 'Standard Bambara (Bamako/Ségou) — Malian orthography',
    'ful': 'Pular (Labé/Mamou dialects) — Guinean orthography',
}.get(TRAIN_LANG, '')

print(f'Environment : {_ENV}')
print(f'Language    : {TRAIN_LANG} ({LANG_NAME}) — {LANG_COUNTRY}')
print(f'Dialect     : {LANG_DIALECT}')
print(f'Model       : {WHISPER_MODEL_ID}')
print(f'Working dir : {WORKING_DIR}')
print(f'Output      : {OUTPUT_DIR}')
print(f'Max steps   : {MAX_STEPS} (batch={BATCH_SIZE}, grad_accum={GRAD_ACCUM}, eff={BATCH_SIZE*GRAD_ACCUM})')
|
| 66 |
]
|
| 67 |
},
|
| 68 |
{
|
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# -----------------------------------------------------------------------------
# Sahel-Voice-Core — RunPod bootstrap
# -----------------------------------------------------------------------------
# Run this once inside a fresh RunPod pod (PyTorch 2.2 + CUDA 12.1 template).
# It clones the repo into /workspace, installs training-only dependencies
# (the HF Space requirements.txt is runtime-only), and prepares secrets.
#
# Usage inside the pod's Jupyter terminal:
#
#   curl -fsSL https://huggingface.co/spaces/ous-sow/sahel-agri-voice/resolve/main/scripts/runpod_setup.sh -o setup.sh
#   bash setup.sh
#
# Or, if you've already cloned the repo:
#
#   bash /workspace/sahel-voice/scripts/runpod_setup.sh
#
# After setup, export HF_TOKEN and open notebooks/kaggle_master_trainer.ipynb.
# Cell 3 auto-detects the RunPod environment; no path edits needed.
# -----------------------------------------------------------------------------
set -euo pipefail

# All three are overridable from the environment for non-default pod layouts.
REPO_URL="${REPO_URL:-https://huggingface.co/spaces/ous-sow/sahel-agri-voice}"
WORKSPACE="${WORKSPACE:-/workspace}"
REPO_DIR="${REPO_DIR:-${WORKSPACE}/sahel-voice}"

echo "=============================================="
echo " Sahel-Voice-Core — RunPod setup"
echo "=============================================="
echo " Workspace : ${WORKSPACE}"
echo " Repo      : ${REPO_DIR}"
echo "=============================================="

# 1. Clone (idempotent — re-running the script just fast-forwards the repo)
if [[ ! -d "${REPO_DIR}/.git" ]]; then
    echo ">> Cloning repo..."
    git clone "${REPO_URL}" "${REPO_DIR}"
else
    echo ">> Repo already present — pulling latest."
    # `|| true`: a dirty/diverged checkout should not abort the whole setup.
    git -C "${REPO_DIR}" pull --ff-only || true
fi

cd "${REPO_DIR}"

# 2. Training dependencies (not in requirements.txt which is runtime-only)
# NOTE(review): verify these exact pins exist on PyPI and match the Space's
# training stack — TODO confirm before relying on them.
echo ">> Installing training dependencies..."
pip install -q --upgrade pip
pip install -q \
    "transformers==5.5.0" \
    "datasets==4.8.4" \
    "accelerate==1.13.0" \
    "huggingface-hub==1.9.0" \
    "peft>=0.13.0" \
    "evaluate>=0.4.1" \
    "jiwer==3.0.4" \
    "librosa==0.10.2" \
    "soundfile==0.12.1" \
    "tensorboard>=2.14" \
    "pypdf>=4.0.0" \
    "python-docx>=1.1.0"

# 3. HF token prompt (one-time)
ENV_FILE="${REPO_DIR}/.env"
if [[ -z "${HF_TOKEN:-}" ]] && [[ ! -f "${ENV_FILE}" ]]; then
    echo ""
    echo "=============================================="
    echo " HF_TOKEN not set."
    echo " Get a write-scoped token from"
    echo "   https://huggingface.co/settings/tokens"
    echo " Then either:"
    echo "   export HF_TOKEN=hf_xxxxxxxx"
    echo " or add it to ${ENV_FILE}:"
    echo "   echo 'HF_TOKEN=hf_xxxxxxxx' > ${ENV_FILE}"
    echo "=============================================="
elif [[ -f "${ENV_FILE}" ]]; then
    # Source it so this shell has HF_TOKEN available for downstream commands.
    # NOTE(review): this branch also runs when HF_TOKEN is already exported, so
    # a value in .env silently overrides the exported one — confirm intended.
    set -a
    # shellcheck disable=SC1090
    source "${ENV_FILE}"
    set +a
    echo ">> Loaded env vars from ${ENV_FILE}"
fi

# 4. Persistent output dir for checkpoints (survives pod stop via Volume disk)
mkdir -p "${WORKSPACE}/adapter_bam" "${WORKSPACE}/adapter_ful" \
         "${WORKSPACE}/data" "${WORKSPACE}/audio_feedback"

# 5. GPU sanity check
python - <<'PY'
import torch
print("=" * 46)
print(f" PyTorch        : {torch.__version__}")
print(f" CUDA available : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    p = torch.cuda.get_device_properties(0)
    print(f" GPU            : {p.name}")
    print(f" VRAM           : {p.total_memory/1e9:.1f} GB")
    print(f" Compute cap    : {p.major}.{p.minor}")
print("=" * 46)
PY

echo ""
echo "✅ Setup complete."
echo ""
echo "Next steps:"
echo "  1. Open Jupyter Lab (port 8888 on the pod)"
echo "  2. Navigate to: ${REPO_DIR}/notebooks/kaggle_master_trainer.ipynb"
echo "  3. Set TRAIN_LANG in Cell 3 (or export TRAIN_LANG=ful before launching)"
echo "  4. Run All Cells — Cell 3 auto-detects /workspace and uses RunPod defaults"
echo ""
echo "Checkpoints will be saved to: ${WORKSPACE}/adapter_\$TRAIN_LANG"
echo "This path is on the Volume disk — survives pod stop/restart."