From 89770dd0093d9a8e59d61c269f51c0dc5b6f8046 Mon Sep 17 00:00:00 2001 From: Masahiko AMANO Date: Wed, 20 May 2026 13:00:03 +0300 Subject: [PATCH] feat: add Colab bundle script and pre-training notebook scripts/make_colab_zip.py packages src/, scripts/pretrain.py, requirements.txt, and processed .pt files into hamori_colab.zip, remapping data/processed/{train,val}/ -> data/processed/mcgill/{train,val}/ so pretrain.py finds the data without modification. notebooks/colab_pretrain.ipynb guides through upload, extraction, dependency install, training run, report display, and results download. Co-Authored-By: Claude Sonnet 4.6 --- notebooks/colab_pretrain.ipynb | 178 +++++++++++++++++++++++++++++++++ scripts/make_colab_zip.py | 104 +++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 notebooks/colab_pretrain.ipynb create mode 100644 scripts/make_colab_zip.py diff --git a/notebooks/colab_pretrain.ipynb b/notebooks/colab_pretrain.ipynb new file mode 100644 index 0000000..336c91c --- /dev/null +++ b/notebooks/colab_pretrain.ipynb @@ -0,0 +1,178 @@ +{ + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + }, + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# hamori — pre-training on McGill Billboard corpus\n", + "\n", + "This notebook runs `scripts/pretrain.py` on Google Colab (GPU T4 recommended).\n", + "\n", + "**Steps:**\n", + "1. Check GPU\n", + "2. Upload `hamori_colab.zip` (built locally via `python scripts/make_colab_zip.py`)\n", + "3. Extract and install dependencies\n", + "4. Run pre-training\n", + "5. 
Download checkpoint and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "gpu-check", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 1. GPU check ────────────────────────────────────────────────────────────\n", + "import torch\n", + "if torch.cuda.is_available():\n", + " print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n", + " print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n", + "else:\n", + " print(\"No GPU found — training will be slow on CPU.\")\n", + " print(\"Go to Runtime → Change runtime type → T4 GPU and re-run.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upload", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 2. Upload hamori_colab.zip ───────────────────────────────────────────────\n", + "# Build it locally first: python scripts/make_colab_zip.py\n", + "from google.colab import files\n", + "uploaded = files.upload() # select hamori_colab.zip from your machine\n", + "print(\"Uploaded:\", list(uploaded.keys()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "extract", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 3. 
Extract and install dependencies ─────────────────────────────────────\n", + "import zipfile, os\n", + "\n", + "WORK_DIR = \"/content/hamori\"\n", + "os.makedirs(WORK_DIR, exist_ok=True)\n", + "\n", + "with zipfile.ZipFile(\"hamori_colab.zip\") as zf:\n", + " zf.extractall(WORK_DIR)\n", + " print(f\"Extracted {len(zf.namelist())} files to {WORK_DIR}\")\n", + "\n", + "os.chdir(WORK_DIR)\n", + "print(\"Working directory:\", os.getcwd())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-deps", + "metadata": {}, + "outputs": [], + "source": [ + "# Colab ships torch; only install the extra deps\n", + "!pip install -q pretty_midi mido matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "data-check", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 4. Verify data ───────────────────────────────────────────────────────────\n", + "from pathlib import Path\n", + "train_pt = list(Path(\"data/processed/mcgill/train\").glob(\"*.pt\"))\n", + "val_pt = list(Path(\"data/processed/mcgill/val\").glob(\"*.pt\"))\n", + "print(f\"Train: {len(train_pt)} files\")\n", + "print(f\"Val: {len(val_pt)} files\")\n", + "if not train_pt:\n", + " print(\"ERROR: no training data found — did you build the zip with data included?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "pretrain", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 5. Pre-train ─────────────────────────────────────────────────────────────\n", + "# Outputs:\n", + "# checkpoints/pretrained.pt\n", + "# checkpoints/pretrained.log.csv\n", + "# checkpoints/pretrained_curves.png\n", + "# checkpoints/pretrained.report.txt\n", + "!python scripts/pretrain.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-report", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 6. 
Show report ───────────────────────────────────────────────────────────\n", + "report = Path(\"checkpoints/pretrained.report.txt\")\n", + "if report.exists():\n", + " print(report.read_text(encoding=\"utf-8\"))\n", + "else:\n", + " print(\"Report not found — training may have failed.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-curves", + "metadata": {}, + "outputs": [], + "source": [ + "# Show loss curves inline\n", + "from IPython.display import Image\n", + "Image(\"checkpoints/pretrained_curves.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "download", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 7. Download results ──────────────────────────────────────────────────────\n", + "import shutil\n", + "from google.colab import files\n", + "\n", + "# Bundle all checkpoint outputs into a single zip for download\n", + "shutil.make_archive(\"/content/pretrain_results\", \"zip\", WORK_DIR, \"checkpoints\")\n", + "files.download(\"/content/pretrain_results.zip\")" + ] + } + ] +} diff --git a/scripts/make_colab_zip.py b/scripts/make_colab_zip.py new file mode 100644 index 0000000..bea0163 --- /dev/null +++ b/scripts/make_colab_zip.py @@ -0,0 +1,104 @@ +"""Package the hamori project for Google Colab pre-training. + +Creates hamori_colab.zip containing: + - src/ (all Python modules) + - scripts/pretrain.py (pre-training script) + - requirements.txt + - data/processed/mcgill/train/*.pt (remapped from data/processed/train/) + - data/processed/mcgill/val/*.pt (remapped from data/processed/val/) + +The local processed data lives at data/processed/{train,val}/ but pretrain.py +expects data/processed/mcgill/{train,val}/. This script remaps the paths +inside the zip so no code changes are needed on Colab. 
# Command-line usage:
#     python scripts/make_colab_zip.py
#     python scripts/make_colab_zip.py --output my_bundle.zip
#     python scripts/make_colab_zip.py --no-data   # skip .pt files (code only)
#
# Output:
#     hamori_colab.zip (in project root by default)

from __future__ import annotations

import argparse
import sys
import zipfile
from pathlib import Path

# Project root: two directory levels above this file.
ROOT = Path(__file__).resolve().parent.parent

DEFAULT_OUT = ROOT / "hamori_colab.zip"

# Files to include verbatim (paths relative to ROOT).
# NOTE: ZipFile.write() does not recurse, so a directory listed here would be
# stored as an empty directory entry only — list individual files.
VERBATIM: list[str] = [
    "requirements.txt",
    "scripts/pretrain.py",
]

SRC_DIR = ROOT / "src"

# Local data dirs → path inside zip
DATA_REMAP: list[tuple[Path, str]] = [
    (ROOT / "data" / "processed" / "train", "data/processed/mcgill/train"),
    (ROOT / "data" / "processed" / "val", "data/processed/mcgill/val"),
]


def _warn(message: str) -> None:
    """Print *message* to stderr with the ``[warn]`` prefix."""
    print(f"[warn] {message}", file=sys.stderr)


def _rebase(path: Path, root: Path) -> Path:
    """Return *path* moved from under ``ROOT`` to under *root*.

    The path is returned unchanged when *root* is ``ROOT`` or when *path* does
    not live under ``ROOT``.
    """
    if root == ROOT:
        return path
    try:
        return root / path.relative_to(ROOT)
    except ValueError:
        return path


def build_zip(out_path: Path, include_data: bool, *, root: Path = ROOT) -> None:
    """Write the Colab bundle to *out_path*.

    The archive receives every ``*.py`` file under ``src/``, the files named in
    ``VERBATIM`` and, when requested, the ``*.pt`` files of each ``DATA_REMAP``
    directory stored under its remapped archive prefix.

    Args:
        out_path: Destination zip file. Missing parent directories are
            created; the archive is opened in ``"w"`` mode, so an existing
            file is replaced.
        include_data: When true, add the ``*.pt`` data files; when false, only
            code and verbatim files are bundled.
        root: Project root to read from. Defaults to ``ROOT``; ``SRC_DIR``,
            ``VERBATIM`` and ``DATA_REMAP`` are resolved against it.

    Missing or empty inputs are reported on stderr and skipped; they do not
    raise. A one-line summary is printed to stdout when the archive is done.
    """
    src_dir = _rebase(SRC_DIR, root)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    n_files = 0

    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # src/ — all .py files. Warn when there are none: a bundle without
        # source code is not usable, and the other missing inputs warn too.
        py_files = sorted(src_dir.rglob("*.py")) if src_dir.is_dir() else []
        if not py_files:
            _warn(f"no .py files found under {src_dir} — bundle has no source code")
        for py in py_files:
            zf.write(py, "src/" + py.relative_to(src_dir).as_posix())
            n_files += 1

        # verbatim files
        for rel in VERBATIM:
            src = root / rel
            if not src.exists():
                _warn(f"missing: {src} — skipped")
                continue
            zf.write(src, rel)
            n_files += 1

        # data files with path remapping
        if include_data:
            for local_dir, arc_prefix in DATA_REMAP:
                data_dir = _rebase(local_dir, root)
                if not data_dir.exists():
                    _warn(f"data dir not found: {data_dir} — skipped")
                    continue
                pts = sorted(data_dir.glob("*.pt"))
                if not pts:
                    # The directory exists but contributes nothing; say so now
                    # rather than after the bundle has been uploaded.
                    _warn(f"no .pt files in {data_dir}")
                for pt in pts:
                    zf.write(pt, f"{arc_prefix}/{pt.name}")
                    n_files += 1
                print(f"[data] {arc_prefix}: {len(pts)} files")
        else:
            print("[data] skipped (--no-data)")

    # Stat only after the archive is closed so the size is final.
    size_mb = out_path.stat().st_size / 1_048_576
    print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)")


def main() -> None:
    """Parse the command-line options and build the bundle."""
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--output", type=Path, default=DEFAULT_OUT,
                    help="Output zip path (default: hamori_colab.zip)")
    ap.add_argument("--no-data", action="store_true",
                    help="Exclude .pt data files (bundle code only).")
    args = ap.parse_args()

    build_zip(args.output, include_data=not args.no_data)


if __name__ == "__main__":
    main()