From 8f657ca916c13c5d7592988bd8d2e39231d35317 Mon Sep 17 00:00:00 2001 From: Masahiko AMANO Date: Thu, 21 May 2026 19:47:10 +0300 Subject: [PATCH] scripts: add --mode finetune to make_colab_zip, add colab_finetune notebook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit make_colab_zip.py now accepts --mode pretrain|finetune (default: pretrain). Finetune mode bundles scripts/train.py + data/processed/user/{train,val}/*.pt plus an optional --include-checkpoint flag for pretrained.pt. notebooks/colab_finetune.ipynb covers the full Colab fine-tuning workflow: upload zip → upload pretrained.pt → verify data → train → inspect → download. Co-Authored-By: Claude Sonnet 4.6 --- notebooks/colab_finetune.ipynb | 212 +++++++++++++++++++++++++++++++++ scripts/make_colab_zip.py | 120 ++++++++++++++----- 2 files changed, 299 insertions(+), 33 deletions(-) create mode 100644 notebooks/colab_finetune.ipynb diff --git a/notebooks/colab_finetune.ipynb b/notebooks/colab_finetune.ipynb new file mode 100644 index 0000000..d92d805 --- /dev/null +++ b/notebooks/colab_finetune.ipynb @@ -0,0 +1,212 @@ +{ + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# hamori — fine-tuning on personal chord corpus\n", + "\n", + "This notebook fine-tunes a pre-trained ChordTransformer on your tokenized `.pt` files using Google Colab (GPU T4 recommended).\n", + "\n", + "**Prerequisites (done locally before uploading):**\n", + "- `python scripts/prepare_data.py --input-dir data/raw_user --output-dir data/processed/user`\n", + "- `python scripts/make_colab_zip.py --mode finetune`\n", + "- Have `checkpoints/pretrained.pt` from a completed pre-training run.\n", + "\n", + "**Steps:**\n", + "1. 
Check GPU\n", + "2. Upload `hamori_colab_finetune.zip`\n", + "3. Extract and install dependencies\n", + "4. Upload `pretrained.pt` checkpoint\n", + "5. Verify processed data\n", + "6. Run fine-tuning\n", + "7. Inspect results\n", + "8. Download checkpoint and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "gpu-check", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 1. GPU check ────────────────────────────────────────────────────────────\n", + "import torch\n", + "if torch.cuda.is_available():\n", + " print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n", + " print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n", + "else:\n", + " print(\"No GPU found — training will be slow on CPU.\")\n", + " print(\"Go to Runtime → Change runtime type → T4 GPU and re-run.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upload-zip", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 2. Upload hamori_colab_finetune.zip ──────────────────────────────────────\n", + "# Build it locally first:\n", + "# python scripts/make_colab_zip.py --mode finetune\n", + "from google.colab import files\n", + "uploaded = files.upload() # select hamori_colab_finetune.zip\n", + "print(\"Uploaded:\", list(uploaded.keys()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "extract", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 3. 
Extract and install dependencies ─────────────────────────────────────\n", + "import zipfile, os\n", + "\n", + "WORK_DIR = \"/content/hamori\"\n", + "os.makedirs(WORK_DIR, exist_ok=True)\n", + "\n", + "zip_name = [k for k in uploaded if k.endswith(\".zip\")][0]\n", + "with zipfile.ZipFile(zip_name) as zf:\n", + " zf.extractall(WORK_DIR)\n", + " print(f\"Extracted {len(zf.namelist())} files to {WORK_DIR}\")\n", + "\n", + "os.chdir(WORK_DIR)\n", + "print(\"Working directory:\", os.getcwd())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-deps", + "metadata": {}, + "outputs": [], + "source": [ + "# Colab ships torch; only install the extra deps\n", + "!pip install -q pretty_midi mido music21 matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upload-checkpoint", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 4. Upload pretrained checkpoint ─────────────────────────────────────────\n", + "# Skip this cell if you built the zip with --include-checkpoint.\n", + "import os\n", + "from pathlib import Path\n", + "from google.colab import files\n", + "\n", + "ckpt_path = Path(\"checkpoints/pretrained.pt\")\n", + "if ckpt_path.exists():\n", + " print(f\"Checkpoint already present: {ckpt_path} ({ckpt_path.stat().st_size / 1e6:.1f} MB)\")\n", + "else:\n", + " print(\"Upload checkpoints/pretrained.pt from your local machine.\")\n", + " uploaded_ckpt = files.upload() # select pretrained.pt\n", + " ckpt_path.parent.mkdir(parents=True, exist_ok=True)\n", + " src = list(uploaded_ckpt.keys())[0]\n", + " os.rename(src, ckpt_path)\n", + " print(f\"Saved to {ckpt_path} ({ckpt_path.stat().st_size / 1e6:.1f} MB)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "verify-data", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 5. 
Verify processed user corpus ─────────────────────────────────────────\n", + "from pathlib import Path\n", + "train_pt = list(Path(\"data/processed/user/train\").glob(\"*.pt\"))\n", + "val_pt = list(Path(\"data/processed/user/val\").glob(\"*.pt\"))\n", + "print(f\"Train: {len(train_pt)} files\")\n", + "print(f\"Val: {len(val_pt)} files\")\n", + "if not train_pt:\n", + " print()\n", + " print(\"ERROR: no training data found.\")\n", + " print(\"Run locally first: python scripts/prepare_data.py \")\n", + " print(\" --input-dir data/raw_user --output-dir data/processed/user\")\n", + " print(\"Then rebuild the zip: python scripts/make_colab_zip.py --mode finetune\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "finetune", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 6. Fine-tune ─────────────────────────────────────────────────────────────\n", + "# Outputs:\n", + "# checkpoints/finetuned.pt\n", + "# checkpoints/finetuned.log.csv\n", + "# checkpoints/finetuned_curves.png\n", + "# checkpoints/finetuned.report.txt\n", + "!python scripts/train.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-report", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 7a. Show report ───────────────────────────────────────────────────────────\n", + "from pathlib import Path\n", + "report = Path(\"checkpoints/finetuned.report.txt\")\n", + "if report.exists():\n", + " print(report.read_text(encoding=\"utf-8\"))\n", + "else:\n", + " print(\"Report not found — training may have failed.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-curves", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 7b. 
Show loss curves ─────────────────────────────────────────────────────\n", + "from IPython.display import Image\n", + "Image(\"checkpoints/finetuned_curves.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "download", + "metadata": {}, + "outputs": [], + "source": [ + "# ── 8. Download results ───────────────────────────────────────────────────────\n", + "import shutil\n", + "from google.colab import files\n", + "\n", + "shutil.make_archive(\"/content/finetune_results\", \"zip\", WORK_DIR, \"checkpoints\")\n", + "files.download(\"/content/finetune_results.zip\")" + ] + } + ] +} diff --git a/scripts/make_colab_zip.py b/scripts/make_colab_zip.py index bea0163..755d708 100644 --- a/scripts/make_colab_zip.py +++ b/scripts/make_colab_zip.py @@ -1,23 +1,40 @@ -"""Package the hamori project for Google Colab pre-training. +"""Package the hamori project for Google Colab pre-training or fine-tuning. -Creates hamori_colab.zip containing: +pretrain mode (default): - src/ (all Python modules) - - scripts/pretrain.py (pre-training script) + - scripts/pretrain.py - requirements.txt - data/processed/mcgill/train/*.pt (remapped from data/processed/train/) - data/processed/mcgill/val/*.pt (remapped from data/processed/val/) -The local processed data lives at data/processed/{train,val}/ but pretrain.py -expects data/processed/mcgill/{train,val}/. This script remaps the paths -inside the zip so no code changes are needed on Colab. 
+finetune mode: + - src/ (all Python modules) + - scripts/train.py + - requirements.txt + - data/processed/user/train/*.pt + - data/processed/user/val/*.pt + - checkpoints/pretrained.pt (only with --include-checkpoint) Usage: + # Pre-training bundle (default) python scripts/make_colab_zip.py - python scripts/make_colab_zip.py --output my_bundle.zip - python scripts/make_colab_zip.py --no-data # skip .pt files (code only) + python scripts/make_colab_zip.py --mode pretrain -Output: - hamori_colab.zip (in project root by default) + # Fine-tuning bundle (run prepare_data.py locally first) + python scripts/make_colab_zip.py --mode finetune + + # Fine-tuning bundle with pretrained checkpoint included + python scripts/make_colab_zip.py --mode finetune --include-checkpoint + + # Exclude data files (code only) + python scripts/make_colab_zip.py --mode finetune --no-data + + # Custom output path + python scripts/make_colab_zip.py --mode pretrain --output my_bundle.zip + +Outputs: + output/hamori_colab_pretrain.zip (pretrain mode) + output/hamori_colab_finetune.zip (finetune mode) """ from __future__ import annotations @@ -28,28 +45,37 @@ import zipfile from pathlib import Path ROOT = Path(__file__).resolve().parent.parent - -DEFAULT_OUT = ROOT / "hamori_colab.zip" - -# Files/dirs to include verbatim (paths relative to ROOT) -VERBATIM: list[str] = [ - "requirements.txt", - "scripts/pretrain.py", -] - SRC_DIR = ROOT / "src" +OUT_DIR = ROOT / "output" -# Local data dirs → path inside zip -DATA_REMAP: list[tuple[Path, str]] = [ +COMMON_VERBATIM: list[str] = ["requirements.txt"] + +MODE_SCRIPTS: dict[str, list[str]] = { + "pretrain": ["scripts/pretrain.py"], + "finetune": ["scripts/train.py"], +} + +# Local dir → arc path inside zip +PRETRAIN_DATA: list[tuple[Path, str]] = [ (ROOT / "data" / "processed" / "train", "data/processed/mcgill/train"), (ROOT / "data" / "processed" / "val", "data/processed/mcgill/val"), ] +FINETUNE_DATA: list[tuple[Path, str]] = [ + (ROOT / "data" / 
"processed" / "user" / "train", "data/processed/user/train"), + (ROOT / "data" / "processed" / "user" / "val", "data/processed/user/val"), +] -def build_zip(out_path: Path, include_data: bool) -> None: +PRETRAINED_CKPT = ROOT / "checkpoints" / "pretrained.pt" + + +def build_zip(out_path: Path, mode: str, include_data: bool, + include_checkpoint: bool) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) n_files = 0 + data_map = PRETRAIN_DATA if mode == "pretrain" else FINETUNE_DATA + with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf: # src/ — all .py files for py in sorted(SRC_DIR.rglob("*.py")): @@ -57,8 +83,8 @@ def build_zip(out_path: Path, include_data: bool) -> None: zf.write(py, arc) n_files += 1 - # verbatim files - for rel in VERBATIM: + # common + mode-specific scripts + for rel in COMMON_VERBATIM + MODE_SCRIPTS[mode]: src = ROOT / rel if not src.exists(): print(f"[warn] missing: {src} — skipped", file=sys.stderr) @@ -66,22 +92,32 @@ def build_zip(out_path: Path, include_data: bool) -> None: zf.write(src, rel) n_files += 1 - # data files with path remapping + # data if include_data: - for local_dir, arc_prefix in DATA_REMAP: + for local_dir, arc_prefix in data_map: if not local_dir.exists(): print(f"[warn] data dir not found: {local_dir} — skipped", file=sys.stderr) continue pts = sorted(local_dir.glob("*.pt")) for pt in pts: - arc = f"{arc_prefix}/{pt.name}" - zf.write(pt, arc) + zf.write(pt, f"{arc_prefix}/{pt.name}") n_files += 1 print(f"[data] {arc_prefix}: {len(pts)} files") else: print("[data] skipped (--no-data)") + # pretrained checkpoint (finetune mode only, opt-in) + if mode == "finetune" and include_checkpoint: + if not PRETRAINED_CKPT.exists(): + print(f"[warn] checkpoint not found: {PRETRAINED_CKPT} — skipped", + file=sys.stderr) + else: + zf.write(PRETRAINED_CKPT, "checkpoints/pretrained.pt") + n_files += 1 + ckpt_mb = PRETRAINED_CKPT.stat().st_size / 1_048_576 + print(f"[ckpt] checkpoints/pretrained.pt 
({ckpt_mb:.1f} MB)") + size_mb = out_path.stat().st_size / 1_048_576 print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)") @@ -91,13 +127,31 @@ def main() -> None: description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, ) - ap.add_argument("--output", type=Path, default=DEFAULT_OUT, - help="Output zip path (default: hamori_colab.zip)") - ap.add_argument("--no-data", action="store_true", - help="Exclude .pt data files (bundle code only).") + ap.add_argument( + "--mode", choices=["pretrain", "finetune"], default="pretrain", + help="Bundle mode: 'pretrain' (default) or 'finetune'.", + ) + ap.add_argument( + "--output", type=Path, default=None, + help="Output zip path. Default: output/hamori_colab_<mode>.zip.", + ) + ap.add_argument( + "--no-data", action="store_true", + help="Exclude data files (code-only bundle).", + ) + ap.add_argument( + "--include-checkpoint", action="store_true", dest="include_checkpoint", + help="(finetune mode) Include checkpoints/pretrained.pt in the zip.", + ) args = ap.parse_args() - build_zip(args.output, include_data=not args.no_data) + out_path = args.output or OUT_DIR / f"hamori_colab_{args.mode}.zip" + build_zip( + out_path=out_path, + mode=args.mode, + include_data=not args.no_data, + include_checkpoint=args.include_checkpoint, + ) if __name__ == "__main__":