feat: add Colab bundle script and pre-training notebook

scripts/make_colab_zip.py packages src/, scripts/pretrain.py, requirements.txt, and processed .pt files into hamori_colab.zip, remapping data/processed/{train,val}/ -> data/processed/mcgill/{train,val}/ so pretrain.py finds the data without modification. notebooks/colab_pretrain.ipynb guides through upload, extraction, dependency install, training run, report display, and results download. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 13:00:03 +03:00
parent 0682ccc140
commit 89770dd009
2 changed files with 282 additions and 0 deletions
@@ -0,0 +1,178 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 5,
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.0"
+  },
+  "colab": {
+   "provenance": [],
+   "gpuType": "T4"
+  },
+  "accelerator": "GPU"
+ },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "title",
+   "metadata": {},
+   "source": [
+    "# hamori — pre-training on McGill Billboard corpus\n",
+    "\n",
+    "This notebook runs `scripts/pretrain.py` on Google Colab (GPU T4 recommended).\n",
+    "\n",
+    "**Steps:**\n",
+    "1. Check GPU\n",
+    "2. Upload `hamori_colab.zip` (built locally via `python scripts/make_colab_zip.py`)\n",
+    "3. Extract and install dependencies\n",
+    "4. Verify the bundled data\n",
+    "5. Run pre-training\n",
+    "6. Show the training report and loss curves\n",
+    "7. Download checkpoint and logs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "gpu-check",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 1. GPU check ────────────────────────────────────────────────────────────\n",
+    "# Report the attached accelerator, or explain how to enable one.\n",
+    "import torch\n",
+    "\n",
+    "if not torch.cuda.is_available():\n",
+    "    print(\"No GPU found — training will be slow on CPU.\")\n",
+    "    print(\"Go to Runtime → Change runtime type → T4 GPU and re-run.\")\n",
+    "else:\n",
+    "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
+    "    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "upload",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 2. Upload hamori_colab.zip ───────────────────────────────────────────────\n",
+    "# Create the bundle on your machine beforehand with:\n",
+    "#     python scripts/make_colab_zip.py\n",
+    "from google.colab import files\n",
+    "\n",
+    "# Opens a file picker — choose hamori_colab.zip.\n",
+    "uploaded = files.upload()\n",
+    "print(\"Uploaded:\", list(uploaded))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "extract",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 3. Extract and install dependencies ─────────────────────────────────────\n",
+    "import zipfile, os\n",
+    "\n",
+    "WORK_DIR = \"/content/hamori\"\n",
+    "ZIP_NAME = \"hamori_colab.zip\"\n",
+    "os.makedirs(WORK_DIR, exist_ok=True)\n",
+    "\n",
+    "# This cell ends with os.chdir(WORK_DIR), so a bare relative path only resolves\n",
+    "# on the first run. Check the current directory first, then /content, so the\n",
+    "# cell can be re-run without a FileNotFoundError.\n",
+    "zip_candidates = [os.path.abspath(ZIP_NAME), os.path.join(\"/content\", ZIP_NAME)]\n",
+    "zip_path = next((p for p in zip_candidates if os.path.isfile(p)), None)\n",
+    "if zip_path is None:\n",
+    "    raise FileNotFoundError(f\"{ZIP_NAME} not found — run the upload cell first.\")\n",
+    "\n",
+    "with zipfile.ZipFile(zip_path) as zf:\n",
+    "    zf.extractall(WORK_DIR)\n",
+    "    print(f\"Extracted {len(zf.namelist())} files to {WORK_DIR}\")\n",
+    "\n",
+    "os.chdir(WORK_DIR)\n",
+    "print(\"Working directory:\", os.getcwd())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "install-deps",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Colab ships torch; only install the extra deps.\n",
+    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
+    "%pip install -q pretty_midi mido matplotlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "data-check",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 4. Verify data ───────────────────────────────────────────────────────────\n",
+    "from pathlib import Path\n",
+    "train_pt = list(Path(\"data/processed/mcgill/train\").glob(\"*.pt\"))\n",
+    "val_pt   = list(Path(\"data/processed/mcgill/val\").glob(\"*.pt\"))\n",
+    "print(f\"Train: {len(train_pt)} files\")\n",
+    "print(f\"Val:   {len(val_pt)} files\")\n",
+    "if not train_pt:\n",
+    "    # Raise instead of printing so \"Run all\" stops here rather than launching\n",
+    "    # the training cell without any data.\n",
+    "    raise FileNotFoundError(\n",
+    "        \"no training data found — did you build the zip with data included?\"\n",
+    "    )\n",
+    "if not val_pt:\n",
+    "    print(\"WARNING: no validation data found.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "pretrain",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 5. Pre-train ─────────────────────────────────────────────────────────────\n",
+    "# Outputs:\n",
+    "#   checkpoints/pretrained.pt\n",
+    "#   checkpoints/pretrained.log.csv\n",
+    "#   checkpoints/pretrained_curves.png\n",
+    "#   checkpoints/pretrained.report.txt\n",
+    "!python scripts/pretrain.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "show-report",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 6. Show report ───────────────────────────────────────────────────────────\n",
+    "# Print the text report from checkpoints/, if the file is there.\n",
+    "report = Path(\"checkpoints/pretrained.report.txt\")\n",
+    "if not report.exists():\n",
+    "    print(\"Report not found — training may have failed.\")\n",
+    "else:\n",
+    "    print(report.read_text(encoding=\"utf-8\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "show-curves",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show loss curves inline\n",
+    "from pathlib import Path\n",
+    "from IPython.display import Image, display\n",
+    "\n",
+    "curves = Path(\"checkpoints/pretrained_curves.png\")\n",
+    "if curves.exists():\n",
+    "    display(Image(filename=str(curves)))\n",
+    "else:\n",
+    "    # Same guard as the report cell: say so instead of failing on a missing file.\n",
+    "    print(\"Curves not found — training may have failed.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "download",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 7. Download results ──────────────────────────────────────────────────────\n",
+    "import shutil\n",
+    "from google.colab import files\n",
+    "\n",
+    "# Zip the checkpoints/ folder under WORK_DIR so everything downloads as one file.\n",
+    "shutil.make_archive(\n",
+    "    base_name=\"/content/pretrain_results\",\n",
+    "    format=\"zip\",\n",
+    "    root_dir=WORK_DIR,\n",
+    "    base_dir=\"checkpoints\",\n",
+    ")\n",
+    "files.download(\"/content/pretrain_results.zip\")"
+   ]
+  }
+ ]
+}
@@ -0,0 +1,104 @@
+"""Package the hamori project for Google Colab pre-training.
+
+Creates hamori_colab.zip containing:
+  - src/                               (all Python modules)
+  - scripts/pretrain.py                (pre-training script)
+  - requirements.txt
+  - data/processed/mcgill/train/*.pt   (remapped from data/processed/train/)
+  - data/processed/mcgill/val/*.pt     (remapped from data/processed/val/)
+
+The local processed data lives at data/processed/{train,val}/ but pretrain.py
+expects data/processed/mcgill/{train,val}/.  This script remaps the paths
+inside the zip so no code changes are needed on Colab.
+
+Usage:
+    python scripts/make_colab_zip.py
+    python scripts/make_colab_zip.py --output my_bundle.zip
+    python scripts/make_colab_zip.py --no-data   # skip .pt files (code only)
+
+Output:
+    hamori_colab.zip  (in project root by default)
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+import zipfile
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parent.parent
+
+DEFAULT_OUT = ROOT / "hamori_colab.zip"
+
+# Files/dirs to include verbatim (paths relative to ROOT)
+VERBATIM: list[str] = [
+    "requirements.txt",
+    "scripts/pretrain.py",
+]
+
+SRC_DIR = ROOT / "src"
+
+# Local data dirs → path inside zip
+DATA_REMAP: list[tuple[Path, str]] = [
+    (ROOT / "data" / "processed" / "train", "data/processed/mcgill/train"),
+    (ROOT / "data" / "processed" / "val",   "data/processed/mcgill/val"),
+]
+
+
+def build_zip(out_path: Path, include_data: bool) -> None:
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    n_files = 0
+
+    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
+        # src/ — all .py files
+        for py in sorted(SRC_DIR.rglob("*.py")):
+            arc = "src/" + py.relative_to(SRC_DIR).as_posix()
+            zf.write(py, arc)
+            n_files += 1
+
+        # verbatim files
+        for rel in VERBATIM:
+            src = ROOT / rel
+            if not src.exists():
+                print(f"[warn] missing: {src} — skipped", file=sys.stderr)
+                continue
+            zf.write(src, rel)
+            n_files += 1
+
+        # data files with path remapping
+        if include_data:
+            for local_dir, arc_prefix in DATA_REMAP:
+                if not local_dir.exists():
+                    print(f"[warn] data dir not found: {local_dir} — skipped",
+                          file=sys.stderr)
+                    continue
+                pts = sorted(local_dir.glob("*.pt"))
+                for pt in pts:
+                    arc = f"{arc_prefix}/{pt.name}"
+                    zf.write(pt, arc)
+                    n_files += 1
+                print(f"[data] {arc_prefix}: {len(pts)} files")
+        else:
+            print("[data] skipped (--no-data)")
+
+    size_mb = out_path.stat().st_size / 1_048_576
+    print(f"[done] {out_path}  ({n_files} files, {size_mb:.1f} MB)")
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    ap.add_argument("--output", type=Path, default=DEFAULT_OUT,
+                    help="Output zip path (default: hamori_colab.zip)")
+    ap.add_argument("--no-data", action="store_true",
+                    help="Exclude .pt data files (bundle code only).")
+    args = ap.parse_args()
+
+    build_zip(args.output, include_data=not args.no_data)
+
+
+if __name__ == "__main__":
+    main()