Files
hamori/notebooks/colab_pretrain.ipynb
H1K0 89770dd009 feat: add Colab bundle script and pre-training notebook
scripts/make_colab_zip.py packages src/, scripts/pretrain.py,
requirements.txt, and processed .pt files into hamori_colab.zip,
remapping data/processed/{train,val}/ -> data/processed/mcgill/{train,val}/
so pretrain.py finds the data without modification.

notebooks/colab_pretrain.ipynb guides through upload, extraction,
dependency install, training run, report display, and results download.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 13:00:03 +03:00

179 lines
5.8 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 5,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.0"
},
"colab": {
"provenance": [],
"gpuType": "T4"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"id": "title",
"metadata": {},
"source": [
"# hamori — pre-training on McGill Billboard corpus\n",
"\n",
"This notebook runs `scripts/pretrain.py` on Google Colab (GPU T4 recommended).\n",
"\n",
"**Steps** (numbered to match the code cells below):\n",
"1. Check GPU\n",
"2. Upload `hamori_colab.zip` (built locally via `python scripts/make_colab_zip.py`)\n",
"3. Extract and install dependencies\n",
"4. Verify data\n",
"5. Run pre-training\n",
"6. Show report and loss curves\n",
"7. Download checkpoint and logs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "gpu-check",
"metadata": {},
"outputs": [],
"source": [
"# ── 1. GPU check ────────────────────────────────────────────────────────────\n",
"# Report the attached CUDA device and its memory, or explain how to enable one.\n",
"import torch\n",
"\n",
"if not torch.cuda.is_available():\n",
"    print(\"No GPU found — training will be slow on CPU.\")\n",
"    print(\"Go to Runtime → Change runtime type → T4 GPU and re-run.\")\n",
"else:\n",
"    gpu_name = torch.cuda.get_device_name(0)\n",
"    print(f\"GPU: {gpu_name}\")\n",
"    # total_memory is reported in bytes; show it in decimal gigabytes.\n",
"    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9\n",
"    print(f\"VRAM: {vram_gb:.1f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "upload",
"metadata": {},
"outputs": [],
"source": [
"# ── 2. Upload hamori_colab.zip ───────────────────────────────────────────────\n",
"# Build it locally first: python scripts/make_colab_zip.py\n",
"from google.colab import files\n",
"\n",
"# Opens the browser file picker — select hamori_colab.zip from your machine.\n",
"uploaded = files.upload()\n",
"uploaded_names = list(uploaded.keys())\n",
"print(\"Uploaded:\", uploaded_names)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "extract",
"metadata": {},
"outputs": [],
"source": [
"# ── 3. Extract and install dependencies ─────────────────────────────────────\n",
"# Unpack the uploaded bundle into WORK_DIR and make WORK_DIR the working\n",
"# directory, so the relative paths used by the later cells resolve.\n",
"import os\n",
"import zipfile\n",
"\n",
"WORK_DIR = \"/content/hamori\"\n",
"ZIP_NAME = \"hamori_colab.zip\"\n",
"\n",
"# This cell ends with os.chdir(WORK_DIR), so a bare relative path to the\n",
"# archive only works on the first run. Look in the current directory and in\n",
"# the parent of WORK_DIR, and take the most recently modified copy, so the\n",
"# cell can be re-run (including after a re-upload).\n",
"search_dirs = [os.getcwd(), os.path.dirname(WORK_DIR)]\n",
"zip_candidates = [os.path.join(d, ZIP_NAME) for d in search_dirs]\n",
"zip_found = [p for p in zip_candidates if os.path.isfile(p)]\n",
"if not zip_found:\n",
"    raise FileNotFoundError(\n",
"        f\"{ZIP_NAME} not found in {search_dirs} — run the upload cell first.\"\n",
"    )\n",
"zip_path = max(zip_found, key=os.path.getmtime)\n",
"print(\"Archive:\", zip_path)\n",
"\n",
"os.makedirs(WORK_DIR, exist_ok=True)\n",
"with zipfile.ZipFile(zip_path) as zf:\n",
"    zf.extractall(WORK_DIR)\n",
"    print(f\"Extracted {len(zf.namelist())} files to {WORK_DIR}\")\n",
"\n",
"os.chdir(WORK_DIR)\n",
"print(\"Working directory:\", os.getcwd())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "install-deps",
"metadata": {},
"outputs": [],
"source": [
"# Colab ships torch; only install the extra deps.\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install -q pretty_midi mido matplotlib"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "data-check",
"metadata": {},
"outputs": [],
"source": [
"# ── 4. Verify data ───────────────────────────────────────────────────────────\n",
"# Count the processed .pt files and stop here if the training split is empty.\n",
"# Raising (instead of printing) keeps \"Run all\" from starting a training run\n",
"# that has no data to train on.\n",
"from pathlib import Path\n",
"\n",
"DATA_ROOT = Path(\"data/processed/mcgill\")  # relative to the working directory\n",
"train_pt = list((DATA_ROOT / \"train\").glob(\"*.pt\"))\n",
"val_pt = list((DATA_ROOT / \"val\").glob(\"*.pt\"))\n",
"print(f\"Train: {len(train_pt)} files\")\n",
"print(f\"Val: {len(val_pt)} files\")\n",
"if not train_pt:\n",
"    raise FileNotFoundError(\n",
"        f\"no training data found under {(DATA_ROOT / 'train').resolve()} — \"\n",
"        \"did you build the zip with data included?\"\n",
"    )\n",
"if not val_pt:\n",
"    print(\"WARNING: no validation data found.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "pretrain",
"metadata": {},
"outputs": [],
"source": [
"# ── 5. Pre-train ─────────────────────────────────────────────────────────────\n",
"# Outputs:\n",
"#   checkpoints/pretrained.pt\n",
"#   checkpoints/pretrained.log.csv\n",
"#   checkpoints/pretrained_curves.png\n",
"#   checkpoints/pretrained.report.txt\n",
"# The script path is relative, so the working directory must be the extracted\n",
"# project root.\n",
"!python scripts/pretrain.py\n",
"\n",
"# A failing `!` command does not raise, so \"Run all\" would carry on to the\n",
"# report and download cells. IPython stores the command's exit status in\n",
"# `_exit_code`; turn a non-zero status into an error.\n",
"if _exit_code != 0:\n",
"    raise RuntimeError(f\"scripts/pretrain.py exited with status {_exit_code}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "show-report",
"metadata": {},
"outputs": [],
"source": [
"# ── 6. Show report ───────────────────────────────────────────────────────────\n",
"# Print the text report if it is there; otherwise say that it is missing.\n",
"report = Path(\"checkpoints/pretrained.report.txt\")\n",
"if not report.exists():\n",
"    print(\"Report not found — training may have failed.\")\n",
"else:\n",
"    report_text = report.read_text(encoding=\"utf-8\")\n",
"    print(report_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "show-curves",
"metadata": {},
"outputs": [],
"source": [
"# Show loss curves inline\n",
"from pathlib import Path\n",
"\n",
"from IPython.display import Image, display\n",
"\n",
"curves = Path(\"checkpoints/pretrained_curves.png\")\n",
"if curves.exists():\n",
"    # Pass filename= explicitly so the string is never treated as raw image data;\n",
"    # display() is needed because the image is no longer the cell's last expression.\n",
"    display(Image(filename=str(curves)))\n",
"else:\n",
"    print(\"Loss curves not found — training may have failed.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "download",
"metadata": {},
"outputs": [],
"source": [
"# ── 7. Download results ──────────────────────────────────────────────────────\n",
"import shutil\n",
"from google.colab import files\n",
"\n",
"# Pack the checkpoints/ folder found under WORK_DIR into one zip archive,\n",
"# then send that archive to the browser as a download.\n",
"shutil.make_archive(\n",
"    base_name=\"/content/pretrain_results\",\n",
"    format=\"zip\",\n",
"    root_dir=WORK_DIR,\n",
"    base_dir=\"checkpoints\",\n",
")\n",
"files.download(\"/content/pretrain_results.zip\")"
]
}
]
}