feat: add Colab bundle script and pre-training notebook
scripts/make_colab_zip.py packages src/, scripts/pretrain.py,
requirements.txt, and processed .pt files into hamori_colab.zip,
remapping data/processed/{train,val}/ -> data/processed/mcgill/{train,val}/
so pretrain.py finds the data without modification.
notebooks/colab_pretrain.ipynb guides the user through upload, extraction,
dependency install, training run, report display, and results download.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,178 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5,
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.11.0"
|
||||
},
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"gpuType": "T4"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "title",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# hamori — pre-training on McGill Billboard corpus\n",
|
||||
"\n",
|
||||
"This notebook runs `scripts/pretrain.py` on Google Colab (GPU T4 recommended).\n",
|
||||
"\n",
|
||||
"**Steps:**\n",
|
||||
"1. Check GPU\n",
|
||||
"2. Upload `hamori_colab.zip` (built locally via `python scripts/make_colab_zip.py`)\n",
|
||||
"3. Extract and install dependencies\n",
|
||||
"4. Run pre-training\n",
|
||||
"5. Download checkpoint and logs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "gpu-check",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ── 1. GPU check ────────────────────────────────────────────────────────────\n",
|
||||
"import torch\n",
|
||||
"if torch.cuda.is_available():\n",
|
||||
" print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
|
||||
" print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n",
|
||||
"else:\n",
|
||||
" print(\"No GPU found — training will be slow on CPU.\")\n",
|
||||
" print(\"Go to Runtime → Change runtime type → T4 GPU and re-run.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "upload",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ── 2. Upload hamori_colab.zip ───────────────────────────────────────────────\n",
|
||||
"# Build it locally first: python scripts/make_colab_zip.py\n",
|
||||
"from google.colab import files\n",
|
||||
"uploaded = files.upload() # select hamori_colab.zip from your machine\n",
|
||||
"print(\"Uploaded:\", list(uploaded.keys()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "extract",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ── 3. Extract and install dependencies ─────────────────────────────────────\n",
|
||||
"import zipfile, os\n",
|
||||
"\n",
|
||||
"WORK_DIR = \"/content/hamori\"\n",
|
||||
"os.makedirs(WORK_DIR, exist_ok=True)\n",
|
||||
"\n",
|
||||
"with zipfile.ZipFile(\"hamori_colab.zip\") as zf:\n",
|
||||
" zf.extractall(WORK_DIR)\n",
|
||||
" print(f\"Extracted {len(zf.namelist())} files to {WORK_DIR}\")\n",
|
||||
"\n",
|
||||
"os.chdir(WORK_DIR)\n",
|
||||
"print(\"Working directory:\", os.getcwd())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "install-deps",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Colab ships torch; only install the extra deps\n",
|
||||
"!pip install -q pretty_midi mido matplotlib"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "data-check",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ── 4. Verify data ───────────────────────────────────────────────────────────\n",
|
||||
"from pathlib import Path\n",
|
||||
"train_pt = list(Path(\"data/processed/mcgill/train\").glob(\"*.pt\"))\n",
|
||||
"val_pt = list(Path(\"data/processed/mcgill/val\").glob(\"*.pt\"))\n",
|
||||
"print(f\"Train: {len(train_pt)} files\")\n",
|
||||
"print(f\"Val: {len(val_pt)} files\")\n",
|
||||
"if not train_pt:\n",
|
||||
" print(\"ERROR: no training data found — did you build the zip with data included?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "pretrain",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ── 5. Pre-train ─────────────────────────────────────────────────────────────\n",
|
||||
"# Outputs:\n",
|
||||
"# checkpoints/pretrained.pt\n",
|
||||
"# checkpoints/pretrained.log.csv\n",
|
||||
"# checkpoints/pretrained_curves.png\n",
|
||||
"# checkpoints/pretrained.report.txt\n",
|
||||
"!python scripts/pretrain.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "show-report",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ── 6. Show report ───────────────────────────────────────────────────────────\n",
|
||||
"report = Path(\"checkpoints/pretrained.report.txt\")\n",
|
||||
"if report.exists():\n",
|
||||
" print(report.read_text(encoding=\"utf-8\"))\n",
|
||||
"else:\n",
|
||||
" print(\"Report not found — training may have failed.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "show-curves",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Show loss curves inline\n",
|
||||
"from IPython.display import Image\n",
|
||||
"Image(\"checkpoints/pretrained_curves.png\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "download",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ── 7. Download results ──────────────────────────────────────────────────────\n",
|
||||
"import shutil\n",
|
||||
"from google.colab import files\n",
|
||||
"\n",
|
||||
"# Bundle all checkpoint outputs into a single zip for download\n",
|
||||
"shutil.make_archive(\"/content/pretrain_results\", \"zip\", WORK_DIR, \"checkpoints\")\n",
|
||||
"files.download(\"/content/pretrain_results.zip\")"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,104 @@
|
||||
"""Package the hamori project for Google Colab pre-training.
|
||||
|
||||
Creates hamori_colab.zip containing:
|
||||
- src/ (all Python modules)
|
||||
- scripts/pretrain.py (pre-training script)
|
||||
- requirements.txt
|
||||
- data/processed/mcgill/train/*.pt (remapped from data/processed/train/)
|
||||
- data/processed/mcgill/val/*.pt (remapped from data/processed/val/)
|
||||
|
||||
The local processed data lives at data/processed/{train,val}/ but pretrain.py
|
||||
expects data/processed/mcgill/{train,val}/. This script remaps the paths
|
||||
inside the zip so no code changes are needed on Colab.
|
||||
|
||||
Usage:
|
||||
python scripts/make_colab_zip.py
|
||||
python scripts/make_colab_zip.py --output my_bundle.zip
|
||||
python scripts/make_colab_zip.py --no-data # skip .pt files (code only)
|
||||
|
||||
Output:
|
||||
hamori_colab.zip (in project root by default)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
# Project root: this script lives one level below it (in scripts/).
ROOT = Path(__file__).resolve().parent.parent

# Where the bundle is written when --output is not given.
DEFAULT_OUT = ROOT.joinpath("hamori_colab.zip")

# Paths (relative to ROOT) stored in the archive under the same relative name.
VERBATIM: list[str] = ["requirements.txt", "scripts/pretrain.py"]

# Root of the Python sources that get bundled under src/.
SRC_DIR = ROOT.joinpath("src")

# (local data directory, archive prefix) pairs: the local train/val splits are
# stored under data/processed/mcgill/ inside the zip.
DATA_REMAP: list[tuple[Path, str]] = [
    (ROOT / "data" / "processed" / split, f"data/processed/mcgill/{split}")
    for split in ("train", "val")
]
|
||||
|
||||
|
||||
def build_zip(out_path: Path, include_data: bool) -> None:
    """Write the Colab bundle to *out_path*.

    The archive receives:

    - every ``*.py`` file below ``SRC_DIR``, stored under ``src/``;
    - every entry of ``VERBATIM``, stored under its own relative path
      (directories are added recursively, file by file);
    - when *include_data* is true, the ``*.pt`` files found directly inside
      each directory of ``DATA_REMAP``, stored under the remapped prefix.

    Missing or empty inputs are reported on stderr and skipped instead of
    aborting, so a partial (e.g. code-only) bundle is still produced.

    Args:
        out_path: Destination zip file; missing parent directories are created.
        include_data: Whether to bundle the processed ``*.pt`` files.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    n_files = 0

    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # src/ — all .py files
        if SRC_DIR.is_dir():
            for py in sorted(SRC_DIR.rglob("*.py")):
                zf.write(py, "src/" + py.relative_to(SRC_DIR).as_posix())
                n_files += 1
        else:
            # Without this the bundle would silently contain no code at all.
            print(f"[warn] missing: {SRC_DIR} — skipped", file=sys.stderr)

        # verbatim files and directories
        for rel in VERBATIM:
            src = ROOT / rel
            if not src.exists():
                print(f"[warn] missing: {src} — skipped", file=sys.stderr)
                continue
            if src.is_dir():
                # zf.write() on a directory stores only an empty directory
                # entry, so walk it and add each contained file explicitly.
                prefix = rel.rstrip("/")
                members = sorted(p for p in src.rglob("*") if p.is_file())
                for member in members:
                    arc = f"{prefix}/{member.relative_to(src).as_posix()}"
                    zf.write(member, arc)
                    n_files += 1
            else:
                zf.write(src, rel)
                n_files += 1

        # data files with path remapping
        if include_data:
            for local_dir, arc_prefix in DATA_REMAP:
                if not local_dir.exists():
                    print(f"[warn] data dir not found: {local_dir} — skipped",
                          file=sys.stderr)
                    continue
                pts = sorted(local_dir.glob("*.pt"))
                for pt in pts:
                    zf.write(pt, f"{arc_prefix}/{pt.name}")
                    n_files += 1
                print(f"[data] {arc_prefix}: {len(pts)} files")
                if not pts:
                    # An existing but empty split would otherwise go unnoticed
                    # until training fails on Colab.
                    print(f"[warn] no .pt files in {local_dir}",
                          file=sys.stderr)
        else:
            print("[data] skipped (--no-data)")

    # Stat only after the archive is closed so the final size is reported.
    size_mb = out_path.stat().st_size / 1_048_576
    print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: parse the options and build the bundle."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    parser.add_argument(
        "--output",
        default=DEFAULT_OUT,
        type=Path,
        help="Output zip path (default: hamori_colab.zip)",
    )
    parser.add_argument(
        "--no-data",
        action="store_true",
        help="Exclude .pt data files (bundle code only).",
    )
    opts = parser.parse_args()

    # --no-data is an opt-out flag, so it is inverted for build_zip.
    build_zip(opts.output, include_data=not opts.no_data)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user