scripts: add --mode finetune to make_colab_zip, add colab_finetune notebook

make_colab_zip.py now accepts --mode pretrain|finetune (default: pretrain). Finetune mode bundles scripts/train.py + data/processed/user/{train,val}/*.pt plus an optional --include-checkpoint flag for pretrained.pt. notebooks/colab_finetune.ipynb covers the full Colab fine-tuning workflow: upload zip → upload pretrained.pt → verify data → train → inspect → download. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 19:47:10 +03:00
parent 6bce48ddf4
commit 8f657ca916
2 changed files with 299 additions and 33 deletions
@@ -0,0 +1,212 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 5,
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.0"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "title",
+   "metadata": {},
+   "source": [
+    "# hamori — fine-tuning on personal chord corpus\n",
+    "\n",
+    "This notebook fine-tunes a pre-trained ChordTransformer on your tokenized `.pt` files using Google Colab (GPU T4 recommended).\n",
+    "\n",
+    "**Prerequisites (done locally before uploading):**\n",
+    "- `python scripts/prepare_data.py --input-dir data/raw_user --output-dir data/processed/user`\n",
+    "- `python scripts/make_colab_zip.py --mode finetune` (writes `output/hamori_colab_finetune.zip` by default)\n",
+    "- Have `checkpoints/pretrained.pt` from a completed pre-training run.\n",
+    "\n",
+    "**Steps:**\n",
+    "1. Check GPU\n",
+    "2. Upload `hamori_colab_finetune.zip`\n",
+    "3. Extract and install dependencies\n",
+    "4. Upload `pretrained.pt` checkpoint (the upload is skipped if `checkpoints/pretrained.pt` is already present)\n",
+    "5. Verify processed data\n",
+    "6. Run fine-tuning\n",
+    "7. Inspect results\n",
+    "8. Download checkpoint and logs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "gpu-check",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 1. GPU check ────────────────────────────────────────────────────────────\n",
+    "import torch\n",
+    "\n",
+    "# Handle the CPU-only case first; otherwise report the first CUDA device.\n",
+    "if not torch.cuda.is_available():\n",
+    "    print(\"No GPU found — training will be slow on CPU.\")\n",
+    "    print(\"Go to Runtime → Change runtime type → T4 GPU and re-run.\")\n",
+    "else:\n",
+    "    gpu_name = torch.cuda.get_device_name(0)\n",
+    "    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9  # bytes → GB\n",
+    "    print(f\"GPU: {gpu_name}\")\n",
+    "    print(f\"VRAM: {vram_gb:.1f} GB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "upload-zip",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 2. Upload hamori_colab_finetune.zip ──────────────────────────────────────\n",
+    "# Build it locally first:\n",
+    "#   python scripts/make_colab_zip.py --mode finetune\n",
+    "from google.colab import files\n",
+    "\n",
+    "# Select hamori_colab_finetune.zip in the upload dialog.\n",
+    "# `uploaded` is read again by the extract cell below.\n",
+    "uploaded = files.upload()\n",
+    "print(\"Uploaded:\", [name for name in uploaded])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "extract",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 3. Extract and install dependencies ─────────────────────────────────────\n",
+    "import io\n",
+    "import os\n",
+    "import zipfile\n",
+    "\n",
+    "WORK_DIR = \"/content/hamori\"\n",
+    "os.makedirs(WORK_DIR, exist_ok=True)\n",
+    "\n",
+    "# Pick the first uploaded .zip; fail with a clear message (not an IndexError)\n",
+    "# when the upload was cancelled or the wrong file was selected.\n",
+    "zip_name = next((name for name in uploaded if name.endswith(\".zip\")), None)\n",
+    "if zip_name is None:\n",
+    "    raise FileNotFoundError(\n",
+    "        \"No .zip file was uploaded — re-run the upload cell and select hamori_colab_finetune.zip.\"\n",
+    "    )\n",
+    "\n",
+    "# Read the archive from the uploaded bytes instead of a relative path on disk,\n",
+    "# so re-running this cell still works after the os.chdir() below.\n",
+    "with zipfile.ZipFile(io.BytesIO(uploaded[zip_name])) as zf:\n",
+    "    zf.extractall(WORK_DIR)\n",
+    "    print(f\"Extracted {len(zf.namelist())} files to {WORK_DIR}\")\n",
+    "\n",
+    "os.chdir(WORK_DIR)\n",
+    "print(\"Working directory:\", os.getcwd())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "install-deps",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Colab ships torch; only install the extra deps.\n",
+    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
+    "%pip install -q pretty_midi mido music21 matplotlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "upload-checkpoint",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 4. Upload pretrained checkpoint ─────────────────────────────────────────\n",
+    "# Safe to run in every case: if the zip was built with --include-checkpoint the\n",
+    "# file is already present and no upload is requested.\n",
+    "# Paths are relative to the working directory set by the extract cell.\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "from google.colab import files\n",
+    "\n",
+    "ckpt_path = Path(\"checkpoints/pretrained.pt\")\n",
+    "if ckpt_path.exists():\n",
+    "    print(f\"Checkpoint already present: {ckpt_path} ({ckpt_path.stat().st_size / 1e6:.1f} MB)\")\n",
+    "else:\n",
+    "    print(\"Upload checkpoints/pretrained.pt from your local machine.\")\n",
+    "    uploaded_ckpt = files.upload()   # select pretrained.pt\n",
+    "    if not uploaded_ckpt:\n",
+    "        # Nothing was uploaded (e.g. the dialog was cancelled): fail clearly\n",
+    "        # instead of with an IndexError on the lookup below.\n",
+    "        raise FileNotFoundError(\"No checkpoint uploaded — re-run this cell and select pretrained.pt.\")\n",
+    "    ckpt_path.parent.mkdir(parents=True, exist_ok=True)\n",
+    "    # Whatever the uploaded file is called, store it under the expected name.\n",
+    "    src = next(iter(uploaded_ckpt))\n",
+    "    os.rename(src, ckpt_path)\n",
+    "    print(f\"Saved to {ckpt_path} ({ckpt_path.stat().st_size / 1e6:.1f} MB)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "verify-data",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 5. Verify processed user corpus ─────────────────────────────────────────\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Count the tokenized .pt files extracted from the zip.\n",
+    "train_pt = list(Path(\"data/processed/user/train\").glob(\"*.pt\"))\n",
+    "val_pt   = list(Path(\"data/processed/user/val\").glob(\"*.pt\"))\n",
+    "print(f\"Train: {len(train_pt)} files\")\n",
+    "print(f\"Val:   {len(val_pt)} files\")\n",
+    "if not train_pt:\n",
+    "    print()\n",
+    "    print(\"ERROR: no training data found.\")\n",
+    "    print(\"Run locally first:\")\n",
+    "    print(\"  python scripts/prepare_data.py --input-dir data/raw_user --output-dir data/processed/user\")\n",
+    "    print(\"Then rebuild the zip: python scripts/make_colab_zip.py --mode finetune\")\n",
+    "    # Stop here so that \"Run all\" does not go on to launch training without data.\n",
+    "    raise FileNotFoundError(\"data/processed/user/train contains no .pt files\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "finetune",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 6. Fine-tune ─────────────────────────────────────────────────────────────\n",
+    "# Runs the bundled training script with no arguments, i.e. with its defaults.\n",
+    "# NOTE(review): presumably those defaults read checkpoints/pretrained.pt and\n",
+    "# data/processed/user/{train,val} — confirm against scripts/train.py.\n",
+    "# Outputs:\n",
+    "#   checkpoints/finetuned.pt\n",
+    "#   checkpoints/finetuned.log.csv\n",
+    "#   checkpoints/finetuned_curves.png\n",
+    "#   checkpoints/finetuned.report.txt\n",
+    "!python scripts/train.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "show-report",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 7a. Show report ───────────────────────────────────────────────────────────\n",
+    "from pathlib import Path\n",
+    "\n",
+    "report_path = Path(\"checkpoints/finetuned.report.txt\")\n",
+    "# Print the training report if the run produced one; otherwise say so.\n",
+    "message = (\n",
+    "    report_path.read_text(encoding=\"utf-8\")\n",
+    "    if report_path.exists()\n",
+    "    else \"Report not found — training may have failed.\"\n",
+    ")\n",
+    "print(message)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "show-curves",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 7b. Show loss curves ─────────────────────────────────────────────────────\n",
+    "from pathlib import Path\n",
+    "from IPython.display import Image, display\n",
+    "\n",
+    "curves_path = Path(\"checkpoints/finetuned_curves.png\")\n",
+    "if curves_path.exists():\n",
+    "    # display() is required here: a bare expression inside an if-block is not rendered.\n",
+    "    display(Image(filename=str(curves_path)))\n",
+    "else:\n",
+    "    # Same guard as the report cell: a missing plot gets a clear message\n",
+    "    # instead of an obscure display error.\n",
+    "    print(\"Loss curves not found — training may have failed.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "download",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── 8. Download results ───────────────────────────────────────────────────────\n",
+    "import zipfile\n",
+    "from pathlib import Path\n",
+    "from google.colab import files\n",
+    "\n",
+    "RESULTS_ZIP = \"/content/finetune_results.zip\"\n",
+    "ckpt_dir = Path(WORK_DIR) / \"checkpoints\"  # WORK_DIR is defined in the extract cell\n",
+    "\n",
+    "# Bundle everything under checkpoints/ except pretrained.pt: that file was an\n",
+    "# input to this run, so downloading it again only inflates the archive.\n",
+    "with zipfile.ZipFile(RESULTS_ZIP, \"w\", compression=zipfile.ZIP_DEFLATED) as zf:\n",
+    "    for path in sorted(ckpt_dir.rglob(\"*\")):\n",
+    "        if path.is_file() and path.name != \"pretrained.pt\":\n",
+    "            # Keep the checkpoints/ prefix inside the archive.\n",
+    "            zf.write(path, path.relative_to(WORK_DIR))\n",
+    "\n",
+    "files.download(RESULTS_ZIP)"
+   ]
+  }
+ ]
+}
@@ -1,23 +1,40 @@
-"""Package the hamori project for Google Colab pre-training.
+"""Package the hamori project for Google Colab pre-training or fine-tuning.

-Creates hamori_colab.zip containing:
+pretrain mode (default):
  - src/                               (all Python modules)
-  - scripts/pretrain.py                (pre-training script)
+  - scripts/pretrain.py
  - requirements.txt
  - data/processed/mcgill/train/*.pt   (remapped from data/processed/train/)
  - data/processed/mcgill/val/*.pt     (remapped from data/processed/val/)

-The local processed data lives at data/processed/{train,val}/ but pretrain.py
-expects data/processed/mcgill/{train,val}/.  This script remaps the paths
-inside the zip so no code changes are needed on Colab.
+finetune mode:
+  - src/                               (all Python modules)
+  - scripts/train.py
+  - requirements.txt
+  - data/processed/user/train/*.pt
+  - data/processed/user/val/*.pt
+  - checkpoints/pretrained.pt          (only with --include-checkpoint)

 Usage:
+    # Pre-training bundle (default)
    python scripts/make_colab_zip.py
-    python scripts/make_colab_zip.py --output my_bundle.zip
-    python scripts/make_colab_zip.py --no-data   # skip .pt files (code only)
+    python scripts/make_colab_zip.py --mode pretrain

-Output:
-    hamori_colab.zip  (in project root by default)
+    # Fine-tuning bundle (run prepare_data.py locally first)
+    python scripts/make_colab_zip.py --mode finetune
+
+    # Fine-tuning bundle with pretrained checkpoint included
+    python scripts/make_colab_zip.py --mode finetune --include-checkpoint
+
+    # Exclude data files (code only)
+    python scripts/make_colab_zip.py --mode finetune --no-data
+
+    # Custom output path
+    python scripts/make_colab_zip.py --mode pretrain --output my_bundle.zip
+
+Outputs:
+    output/hamori_colab_pretrain.zip  (pretrain mode)
+    output/hamori_colab_finetune.zip  (finetune mode)
 """

 from __future__ import annotations
@@ -28,28 +45,37 @@ import zipfile
 from pathlib import Path

 ROOT = Path(__file__).resolve().parent.parent
-
-DEFAULT_OUT = ROOT / "hamori_colab.zip"
-
-# Files/dirs to include verbatim (paths relative to ROOT)
-VERBATIM: list[str] = [
-    "requirements.txt",
-    "scripts/pretrain.py",
-]
-
 SRC_DIR = ROOT / "src"
+OUT_DIR = ROOT / "output"

-# Local data dirs → path inside zip
-DATA_REMAP: list[tuple[Path, str]] = [
+COMMON_VERBATIM: list[str] = ["requirements.txt"]
+
+MODE_SCRIPTS: dict[str, list[str]] = {
+    "pretrain": ["scripts/pretrain.py"],
+    "finetune": ["scripts/train.py"],
+}
+
+# Local dir → arc path inside zip
+PRETRAIN_DATA: list[tuple[Path, str]] = [
    (ROOT / "data" / "processed" / "train", "data/processed/mcgill/train"),
    (ROOT / "data" / "processed" / "val",   "data/processed/mcgill/val"),
 ]

+FINETUNE_DATA: list[tuple[Path, str]] = [
+    (ROOT / "data" / "processed" / "user" / "train", "data/processed/user/train"),
+    (ROOT / "data" / "processed" / "user" / "val",   "data/processed/user/val"),
+]

-def build_zip(out_path: Path, include_data: bool) -> None:
+PRETRAINED_CKPT = ROOT / "checkpoints" / "pretrained.pt"
+
+
+def build_zip(out_path: Path, mode: str, include_data: bool,
+              include_checkpoint: bool) -> None:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    n_files = 0

+    data_map = PRETRAIN_DATA if mode == "pretrain" else FINETUNE_DATA
+
    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # src/ — all .py files
        for py in sorted(SRC_DIR.rglob("*.py")):
@@ -57,8 +83,8 @@ def build_zip(out_path: Path, include_data: bool) -> None:
            zf.write(py, arc)
            n_files += 1

-        # verbatim files
-        for rel in VERBATIM:
+        # common + mode-specific scripts
+        for rel in COMMON_VERBATIM + MODE_SCRIPTS[mode]:
            src = ROOT / rel
            if not src.exists():
                print(f"[warn] missing: {src} — skipped", file=sys.stderr)
@@ -66,22 +92,32 @@ def build_zip(out_path: Path, include_data: bool) -> None:
            zf.write(src, rel)
            n_files += 1

-        # data files with path remapping
+        # data
        if include_data:
-            for local_dir, arc_prefix in DATA_REMAP:
+            for local_dir, arc_prefix in data_map:
                if not local_dir.exists():
                    print(f"[warn] data dir not found: {local_dir} — skipped",
                          file=sys.stderr)
                    continue
                pts = sorted(local_dir.glob("*.pt"))
                for pt in pts:
-                    arc = f"{arc_prefix}/{pt.name}"
-                    zf.write(pt, arc)
+                    zf.write(pt, f"{arc_prefix}/{pt.name}")
                    n_files += 1
                print(f"[data] {arc_prefix}: {len(pts)} files")
        else:
            print("[data] skipped (--no-data)")

+        # pretrained checkpoint (finetune mode only, opt-in)
+        if mode == "finetune" and include_checkpoint:
+            if not PRETRAINED_CKPT.exists():
+                print(f"[warn] checkpoint not found: {PRETRAINED_CKPT} — skipped",
+                      file=sys.stderr)
+            else:
+                zf.write(PRETRAINED_CKPT, "checkpoints/pretrained.pt")
+                n_files += 1
+                ckpt_mb = PRETRAINED_CKPT.stat().st_size / 1_048_576
+                print(f"[ckpt] checkpoints/pretrained.pt ({ckpt_mb:.1f} MB)")
+
    size_mb = out_path.stat().st_size / 1_048_576
    print(f"[done] {out_path}  ({n_files} files, {size_mb:.1f} MB)")

@@ -91,13 +127,31 @@ def main() -> None:
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
-    ap.add_argument("--output", type=Path, default=DEFAULT_OUT,
-                    help="Output zip path (default: hamori_colab.zip)")
-    ap.add_argument("--no-data", action="store_true",
-                    help="Exclude .pt data files (bundle code only).")
+    ap.add_argument(
+        "--mode", choices=["pretrain", "finetune"], default="pretrain",
+        help="Bundle mode: 'pretrain' (default) or 'finetune'.",
+    )
+    ap.add_argument(
+        "--output", type=Path, default=None,
+        help="Output zip path. Default: output/hamori_colab_<mode>.zip.",
+    )
+    ap.add_argument(
+        "--no-data", action="store_true",
+        help="Exclude data files (code-only bundle).",
+    )
+    ap.add_argument(
+        "--include-checkpoint", action="store_true", dest="include_checkpoint",
+        help="(finetune mode) Include checkpoints/pretrained.pt in the zip.",
+    )
    args = ap.parse_args()

-    build_zip(args.output, include_data=not args.no_data)
+    out_path = args.output or OUT_DIR / f"hamori_colab_{args.mode}.zip"
+    build_zip(
+        out_path=out_path,
+        mode=args.mode,
+        include_data=not args.no_data,
+        include_checkpoint=args.include_checkpoint,
+    )


 if __name__ == "__main__":