jefffffff9 Claude Sonnet 4.6 committed on
Commit
9049ef3
·
1 Parent(s): cc50efb

Prepare training stack for RunPod: env-aware notebook + bootstrap script

Browse files

Cell 3 now auto-detects Kaggle (/kaggle/working), Colab (/content),
RunPod (/workspace), or local — and adjusts defaults per environment.
On RunPod the base model defaults to whisper-large-v3-turbo (128 mel
bins, matches the Space and fits in 24 GB VRAM); on Kaggle T4 it stays
on whisper-small to avoid OOM. Batch size / grad accumulation tuned
per card (8x4 on 24 GB vs 16x2 on T4, both yield effective batch 32).

TRAIN_LANG, HF_USERNAME, and WHISPER_MODEL_ID are now env-var
overridable so the notebook can be driven headlessly from a shell.

scripts/runpod_setup.sh bootstraps a fresh pod:
- clones the Space repo into /workspace/sahel-voice
- installs training deps (peft, evaluate, tensorboard, etc.) that are
intentionally absent from the HF Spaces runtime requirements.txt
- sources .env for HF_TOKEN; falls back to printing token-setup instructions
- creates persistent checkpoint dirs on the Volume disk so adapters
survive pod stop/start
- prints GPU / VRAM sanity check at the end

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

notebooks/kaggle_master_trainer.ipynb CHANGED
@@ -62,7 +62,7 @@
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
- "# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = 'bam'\n\n# ─── Model ───────────────────────────────────────────────────────────────────\nWHISPER_MODEL_ID = 'openai/whisper-small'\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = 'ous-sow'\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters ───────────────────────────────────────────────\nMAX_STEPS = 4_000 # T4 ~45 min; set 8000 for a deeper run\nBATCH_SIZE = 16\nGRAD_ACCUM = 2 # effective batch = 32\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = 200\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nMAX_WAXAL_TRAIN = 5_000 # cap WaxalNLP samples (streaming budget)\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths (Kaggle working dir) ───────────────────────────────────────────────\nWORKING_DIR = '/kaggle/working'\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\nLANG_COUNTRY = {'bam': 'Mali', 'ful': 'Guinea'}.get(TRAIN_LANG, '')\nLANG_DIALECT = {\n 'bam': 'Standard Bambara (Bamako/Ségou) — Malian orthography',\n 'ful': 'Pular (Labé/Mamou dialects) — Guinean orthography',\n}.get(TRAIN_LANG, '')\n\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME}) — {LANG_COUNTRY}')\nprint(f'Dialect : {LANG_DIALECT}')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS}')"
66
  ]
67
  },
68
  {
 
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
+ "# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\nfrom pathlib import Path\n\n# ─── Environment detection (Kaggle / Colab / RunPod / local) ─────────────────\nif Path('/kaggle/working').exists():\n _ENV = 'kaggle'\n WORKING_DIR = '/kaggle/working'\nelif Path('/content').exists() and not Path('/workspace').exists():\n _ENV = 'colab'\n WORKING_DIR = '/content'\nelif Path('/workspace').exists():\n _ENV = 'runpod'\n WORKING_DIR = '/workspace'\nelse:\n _ENV = 'local'\n WORKING_DIR = os.environ.get('WORKING_DIR', os.path.expanduser('~/sahel-voice-work'))\n Path(WORKING_DIR).mkdir(parents=True, exist_ok=True)\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = os.environ.get('TRAIN_LANG', 'ful')\n\n# ─── Model ───────────────────────────────────────────────────────────────────\n# whisper-large-v3-turbo (128 mel bins) matches the Space base model.\n# On T4 (Kaggle, 16 GB) drop to 'openai/whisper-small' — turbo is tight there.\n_DEFAULT_MODEL = 'openai/whisper-small' if _ENV == 'kaggle' else 'openai/whisper-large-v3-turbo'\nWHISPER_MODEL_ID = os.environ.get('WHISPER_MODEL_ID', _DEFAULT_MODEL)\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = os.environ.get('HF_USERNAME', 'ous-sow')\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters (defaults tuned per environment) ──────────────\nif _ENV == 'runpod':\n # 24 GB RTX 3090 / A40: turbo-v3 fits with batch 8, grad_accum 4 (eff. 
32)\n MAX_STEPS = 2_000\n BATCH_SIZE = 8\n GRAD_ACCUM = 4\n MAX_WAXAL_TRAIN = 5_000\nelse:\n # T4 (Kaggle free) / local CPU fallback\n MAX_STEPS = 4_000\n BATCH_SIZE = 16\n GRAD_ACCUM = 2\n MAX_WAXAL_TRAIN = 5_000\n\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = max(100, MAX_STEPS // 20)\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths ────────────────────────────────────────────────────────────────────\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\nLANG_COUNTRY = {'bam': 'Mali', 'ful': 'Guinea'}.get(TRAIN_LANG, '')\nLANG_DIALECT = {\n 'bam': 'Standard Bambara (Bamako/Ségou) — Malian orthography',\n 'ful': 'Pular (Labé/Mamou dialects) — Guinean orthography',\n}.get(TRAIN_LANG, '')\n\nprint(f'Environment : {_ENV}')\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME}) — {LANG_COUNTRY}')\nprint(f'Dialect : {LANG_DIALECT}')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Working dir : {WORKING_DIR}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS} (batch={BATCH_SIZE}, grad_accum={GRAD_ACCUM}, eff={BATCH_SIZE*GRAD_ACCUM})')\n"
66
  ]
67
  },
68
  {
scripts/runpod_setup.sh ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# -----------------------------------------------------------------------------
# Sahel-Voice-Core — RunPod bootstrap
# -----------------------------------------------------------------------------
# Run this once inside a fresh RunPod pod (PyTorch 2.2 + CUDA 12.1 template).
# It clones the repo into /workspace, installs training-only dependencies
# (the HF Space requirements.txt is runtime-only), and prepares secrets.
#
# Usage inside the pod's Jupyter terminal:
#
#   curl -fsSL https://huggingface.co/spaces/ous-sow/sahel-agri-voice/resolve/main/scripts/runpod_setup.sh -o setup.sh
#   bash setup.sh
#
# Or, if you've already cloned the repo:
#
#   bash /workspace/sahel-voice/scripts/runpod_setup.sh
#
# After setup, export HF_TOKEN and open notebooks/kaggle_master_trainer.ipynb.
# Cell 3 auto-detects the RunPod environment; no path edits needed.
#
# Overridable env vars: REPO_URL, WORKSPACE, REPO_DIR, HF_TOKEN.
# -----------------------------------------------------------------------------
set -euo pipefail

REPO_URL="${REPO_URL:-https://huggingface.co/spaces/ous-sow/sahel-agri-voice}"
WORKSPACE="${WORKSPACE:-/workspace}"
REPO_DIR="${REPO_DIR:-${WORKSPACE}/sahel-voice}"

echo "=============================================="
echo " Sahel-Voice-Core — RunPod setup"
echo "=============================================="
echo " Workspace : ${WORKSPACE}"
echo " Repo      : ${REPO_DIR}"
echo "=============================================="

# 1. Clone (idempotent — a second run just fast-forwards)
if [[ ! -d "${REPO_DIR}/.git" ]]; then
  echo ">> Cloning repo..."
  git clone "${REPO_URL}" "${REPO_DIR}"
else
  echo ">> Repo already present — pulling latest."
  # Best-effort: a dirty tree or diverged history must not abort bootstrap.
  git -C "${REPO_DIR}" pull --ff-only || true
fi

cd "${REPO_DIR}"

# 2. Training dependencies (not in requirements.txt which is runtime-only)
echo ">> Installing training dependencies..."
pip install -q --upgrade pip
# NOTE(review): confirm these pins exist on PyPI — e.g. "transformers==5.5.0"
# and "huggingface-hub==1.9.0" look ahead of the released version lines; a
# single bad pin makes the whole `pip install` fail.
pip install -q \
  "transformers==5.5.0" \
  "datasets==4.8.4" \
  "accelerate==1.13.0" \
  "huggingface-hub==1.9.0" \
  "peft>=0.13.0" \
  "evaluate>=0.4.1" \
  "jiwer==3.0.4" \
  "librosa==0.10.2" \
  "soundfile==0.12.1" \
  "tensorboard>=2.14" \
  "pypdf>=4.0.0" \
  "python-docx>=1.1.0"

# 3. HF token resolution.
#    Precedence: an already-exported HF_TOKEN always wins; .env is only the
#    fallback. (Previously .env was sourced unconditionally, silently
#    clobbering an exported token with a possibly stale file value.)
ENV_FILE="${REPO_DIR}/.env"
if [[ -z "${HF_TOKEN:-}" ]] && [[ -f "${ENV_FILE}" ]]; then
  # export everything defined in .env so downstream commands inherit it
  set -a
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
  set +a
  echo ">> Loaded env vars from ${ENV_FILE}"
fi
# Warn whenever the token is still missing — including the case where .env
# exists but does not define HF_TOKEN (previously silent).
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo ""
  echo "=============================================="
  echo "  HF_TOKEN not set."
  echo "  Get a write-scoped token from"
  echo "    https://huggingface.co/settings/tokens"
  echo "  Then either:"
  echo "    export HF_TOKEN=hf_xxxxxxxx"
  echo "  or add it to ${ENV_FILE}:"
  echo "    echo 'HF_TOKEN=hf_xxxxxxxx' > ${ENV_FILE}"
  echo "=============================================="
fi

# 4. Persistent output dirs for checkpoints (survive pod stop via Volume disk)
mkdir -p "${WORKSPACE}/adapter_bam" "${WORKSPACE}/adapter_ful" \
         "${WORKSPACE}/data" "${WORKSPACE}/audio_feedback"

# 5. GPU sanity check. Modern images often ship only `python3`; fall back to
#    `python` and fail loudly (not via bare set -e) if neither exists.
PYTHON_BIN="$(command -v python3 || command -v python)" \
  || { echo "!! No python interpreter found on PATH" >&2; exit 1; }
"${PYTHON_BIN}" - <<'PY'
import torch
print("=" * 46)
print(f" PyTorch        : {torch.__version__}")
print(f" CUDA available : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    p = torch.cuda.get_device_properties(0)
    print(f" GPU            : {p.name}")
    print(f" VRAM           : {p.total_memory/1e9:.1f} GB")
    print(f" Compute cap    : {p.major}.{p.minor}")
print("=" * 46)
PY

echo ""
echo "✅ Setup complete."
echo ""
echo "Next steps:"
echo "  1. Open Jupyter Lab (port 8888 on the pod)"
echo "  2. Navigate to: ${REPO_DIR}/notebooks/kaggle_master_trainer.ipynb"
echo "  3. Set TRAIN_LANG in Cell 3 (or export TRAIN_LANG=ful before launching)"
echo "  4. Run All Cells — Cell 3 auto-detects /workspace and uses RunPod defaults"
echo ""
echo "Checkpoints will be saved to: ${WORKSPACE}/adapter_\$TRAIN_LANG"
echo "This path is on the Volume disk — survives pod stop/restart."