scripts: add --mode finetune to make_colab_zip, add colab_finetune notebook
make_colab_zip.py now accepts --mode pretrain|finetune (default: pretrain).
Finetune mode bundles src/, requirements.txt, scripts/train.py and data/processed/user/{train,val}/*.pt;
pass --include-checkpoint to also bundle checkpoints/pretrained.pt.
notebooks/colab_finetune.ipynb covers the full Colab fine-tuning workflow:
upload zip → upload pretrained.pt → verify data → train → inspect → download.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,212 @@
|
|||||||
|
{
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5,
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python",
|
||||||
|
"version": "3.11.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "title",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# hamori — fine-tuning on personal chord corpus\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook fine-tunes a pre-trained ChordTransformer on your tokenized `.pt` files using Google Colab (GPU T4 recommended).\n",
|
||||||
|
"\n",
|
||||||
|
"**Prerequisites (done locally before uploading):**\n",
|
||||||
|
"- `python scripts/prepare_data.py --input-dir data/raw_user --output-dir data/processed/user`\n",
|
||||||
|
"- `python scripts/make_colab_zip.py --mode finetune`\n",
|
||||||
|
"- Have `checkpoints/pretrained.pt` from a completed pre-training run.\n",
|
||||||
|
"\n",
|
||||||
|
"**Steps:**\n",
|
||||||
|
"1. Check GPU\n",
|
||||||
|
"2. Upload `hamori_colab_finetune.zip`\n",
|
||||||
|
"3. Extract and install dependencies\n",
|
||||||
|
"4. Upload `pretrained.pt` checkpoint\n",
|
||||||
|
"5. Verify processed data\n",
|
||||||
|
"6. Run fine-tuning\n",
|
||||||
|
"7. Inspect results\n",
|
||||||
|
"8. Download checkpoint and logs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "gpu-check",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# ── 1. GPU check ────────────────────────────────────────────────────────────\n",
|
||||||
|
"import torch\n",
|
||||||
|
"if torch.cuda.is_available():\n",
|
||||||
|
" print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
|
||||||
|
" print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"No GPU found — training will be slow on CPU.\")\n",
|
||||||
|
" print(\"Go to Runtime → Change runtime type → T4 GPU and re-run.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "upload-zip",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# ── 2. Upload hamori_colab_finetune.zip ──────────────────────────────────────\n",
|
||||||
|
"# Build it locally first:\n",
|
||||||
|
"# python scripts/make_colab_zip.py --mode finetune\n",
|
||||||
|
"from google.colab import files\n",
|
||||||
|
"uploaded = files.upload() # select hamori_colab_finetune.zip\n",
|
||||||
|
"print(\"Uploaded:\", list(uploaded.keys()))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "extract",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# ── 3. Extract and install dependencies ─────────────────────────────────────\n",
|
||||||
|
"import zipfile, os\n",
|
||||||
|
"\n",
|
||||||
|
"WORK_DIR = \"/content/hamori\"\n",
|
||||||
|
"os.makedirs(WORK_DIR, exist_ok=True)\n",
|
||||||
|
"\n",
|
||||||
|
"zip_name = [k for k in uploaded if k.endswith(\".zip\")][0]\n",
|
||||||
|
"with zipfile.ZipFile(zip_name) as zf:\n",
|
||||||
|
" zf.extractall(WORK_DIR)\n",
|
||||||
|
" print(f\"Extracted {len(zf.namelist())} files to {WORK_DIR}\")\n",
|
||||||
|
"\n",
|
||||||
|
"os.chdir(WORK_DIR)\n",
|
||||||
|
"print(\"Working directory:\", os.getcwd())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "install-deps",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Colab ships torch; only install the extra deps\n",
|
||||||
|
"!pip install -q pretty_midi mido music21 matplotlib"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "upload-checkpoint",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# ── 4. Upload pretrained checkpoint ─────────────────────────────────────────\n",
|
||||||
|
"# Skip this cell if you built the zip with --include-checkpoint.\n",
|
||||||
|
"import os\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"from google.colab import files\n",
|
||||||
|
"\n",
|
||||||
|
"ckpt_path = Path(\"checkpoints/pretrained.pt\")\n",
|
||||||
|
"if ckpt_path.exists():\n",
|
||||||
|
" print(f\"Checkpoint already present: {ckpt_path} ({ckpt_path.stat().st_size / 1e6:.1f} MB)\")\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"Upload checkpoints/pretrained.pt from your local machine.\")\n",
|
||||||
|
" uploaded_ckpt = files.upload() # select pretrained.pt\n",
|
||||||
|
" ckpt_path.parent.mkdir(parents=True, exist_ok=True)\n",
|
||||||
|
" src = list(uploaded_ckpt.keys())[0]\n",
|
||||||
|
" os.rename(src, ckpt_path)\n",
|
||||||
|
" print(f\"Saved to {ckpt_path} ({ckpt_path.stat().st_size / 1e6:.1f} MB)\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "verify-data",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# ── 5. Verify processed user corpus ─────────────────────────────────────────\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"train_pt = list(Path(\"data/processed/user/train\").glob(\"*.pt\"))\n",
|
||||||
|
"val_pt = list(Path(\"data/processed/user/val\").glob(\"*.pt\"))\n",
|
||||||
|
"print(f\"Train: {len(train_pt)} files\")\n",
|
||||||
|
"print(f\"Val: {len(val_pt)} files\")\n",
|
||||||
|
"if not train_pt:\n",
|
||||||
|
" print()\n",
|
||||||
|
" print(\"ERROR: no training data found.\")\n",
|
||||||
|
" print(\"Run locally first: python scripts/prepare_data.py \")\n",
|
||||||
|
" print(\" --input-dir data/raw_user --output-dir data/processed/user\")\n",
|
||||||
|
" print(\"Then rebuild the zip: python scripts/make_colab_zip.py --mode finetune\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "finetune",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# ── 6. Fine-tune ─────────────────────────────────────────────────────────────\n",
|
||||||
|
"# Outputs:\n",
|
||||||
|
"# checkpoints/finetuned.pt\n",
|
||||||
|
"# checkpoints/finetuned.log.csv\n",
|
||||||
|
"# checkpoints/finetuned_curves.png\n",
|
||||||
|
"# checkpoints/finetuned.report.txt\n",
|
||||||
|
"!python scripts/train.py"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "show-report",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# ── 7a. Show report ───────────────────────────────────────────────────────────\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"report = Path(\"checkpoints/finetuned.report.txt\")\n",
|
||||||
|
"if report.exists():\n",
|
||||||
|
" print(report.read_text(encoding=\"utf-8\"))\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"Report not found — training may have failed.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "show-curves",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# ── 7b. Show loss curves ─────────────────────────────────────────────────────\n",
|
||||||
|
"from IPython.display import Image\n",
|
||||||
|
"Image(\"checkpoints/finetuned_curves.png\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "download",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# ── 8. Download results ───────────────────────────────────────────────────────\n",
|
||||||
|
"import shutil\n",
|
||||||
|
"from google.colab import files\n",
|
||||||
|
"\n",
|
||||||
|
"shutil.make_archive(\"/content/finetune_results\", \"zip\", WORK_DIR, \"checkpoints\")\n",
|
||||||
|
"files.download(\"/content/finetune_results.zip\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
+87
-33
@@ -1,23 +1,40 @@
|
|||||||
"""Package the hamori project for Google Colab pre-training.
|
"""Package the hamori project for Google Colab pre-training or fine-tuning.
|
||||||
|
|
||||||
Creates hamori_colab.zip containing:
|
pretrain mode (default):
|
||||||
- src/ (all Python modules)
|
- src/ (all Python modules)
|
||||||
- scripts/pretrain.py (pre-training script)
|
- scripts/pretrain.py
|
||||||
- requirements.txt
|
- requirements.txt
|
||||||
- data/processed/mcgill/train/*.pt (remapped from data/processed/train/)
|
- data/processed/mcgill/train/*.pt (remapped from data/processed/train/)
|
||||||
- data/processed/mcgill/val/*.pt (remapped from data/processed/val/)
|
- data/processed/mcgill/val/*.pt (remapped from data/processed/val/)
|
||||||
|
|
||||||
The local processed data lives at data/processed/{train,val}/ but pretrain.py
|
finetune mode:
|
||||||
expects data/processed/mcgill/{train,val}/. This script remaps the paths
|
- src/ (all Python modules)
|
||||||
inside the zip so no code changes are needed on Colab.
|
- scripts/train.py
|
||||||
|
- requirements.txt
|
||||||
|
- data/processed/user/train/*.pt
|
||||||
|
- data/processed/user/val/*.pt
|
||||||
|
- checkpoints/pretrained.pt (only with --include-checkpoint)
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
|
# Pre-training bundle (default)
|
||||||
python scripts/make_colab_zip.py
|
python scripts/make_colab_zip.py
|
||||||
python scripts/make_colab_zip.py --output my_bundle.zip
|
python scripts/make_colab_zip.py --mode pretrain
|
||||||
python scripts/make_colab_zip.py --no-data # skip .pt files (code only)
|
|
||||||
|
|
||||||
Output:
|
# Fine-tuning bundle (run prepare_data.py locally first)
|
||||||
hamori_colab.zip (in project root by default)
|
python scripts/make_colab_zip.py --mode finetune
|
||||||
|
|
||||||
|
# Fine-tuning bundle with pretrained checkpoint included
|
||||||
|
python scripts/make_colab_zip.py --mode finetune --include-checkpoint
|
||||||
|
|
||||||
|
# Exclude data files (code only)
|
||||||
|
python scripts/make_colab_zip.py --mode finetune --no-data
|
||||||
|
|
||||||
|
# Custom output path
|
||||||
|
python scripts/make_colab_zip.py --mode pretrain --output my_bundle.zip
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
output/hamori_colab_pretrain.zip (pretrain mode)
|
||||||
|
output/hamori_colab_finetune.zip (finetune mode)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -28,28 +45,37 @@ import zipfile
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
ROOT = Path(__file__).resolve().parent.parent
|
ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
|
||||||
DEFAULT_OUT = ROOT / "hamori_colab.zip"
|
|
||||||
|
|
||||||
# Files/dirs to include verbatim (paths relative to ROOT)
|
|
||||||
VERBATIM: list[str] = [
|
|
||||||
"requirements.txt",
|
|
||||||
"scripts/pretrain.py",
|
|
||||||
]
|
|
||||||
|
|
||||||
SRC_DIR = ROOT / "src"
|
SRC_DIR = ROOT / "src"
|
||||||
|
OUT_DIR = ROOT / "output"
|
||||||
|
|
||||||
# Local data dirs → path inside zip
|
COMMON_VERBATIM: list[str] = ["requirements.txt"]
|
||||||
DATA_REMAP: list[tuple[Path, str]] = [
|
|
||||||
|
MODE_SCRIPTS: dict[str, list[str]] = {
|
||||||
|
"pretrain": ["scripts/pretrain.py"],
|
||||||
|
"finetune": ["scripts/train.py"],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Local dir → arc path inside zip
|
||||||
|
PRETRAIN_DATA: list[tuple[Path, str]] = [
|
||||||
(ROOT / "data" / "processed" / "train", "data/processed/mcgill/train"),
|
(ROOT / "data" / "processed" / "train", "data/processed/mcgill/train"),
|
||||||
(ROOT / "data" / "processed" / "val", "data/processed/mcgill/val"),
|
(ROOT / "data" / "processed" / "val", "data/processed/mcgill/val"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
FINETUNE_DATA: list[tuple[Path, str]] = [
|
||||||
|
(ROOT / "data" / "processed" / "user" / "train", "data/processed/user/train"),
|
||||||
|
(ROOT / "data" / "processed" / "user" / "val", "data/processed/user/val"),
|
||||||
|
]
|
||||||
|
|
||||||
def build_zip(out_path: Path, include_data: bool) -> None:
|
PRETRAINED_CKPT = ROOT / "checkpoints" / "pretrained.pt"
|
||||||
|
|
||||||
|
|
||||||
|
def build_zip(out_path: Path, mode: str, include_data: bool,
|
||||||
|
include_checkpoint: bool) -> None:
|
||||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
n_files = 0
|
n_files = 0
|
||||||
|
|
||||||
|
data_map = PRETRAIN_DATA if mode == "pretrain" else FINETUNE_DATA
|
||||||
|
|
||||||
with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
||||||
# src/ — all .py files
|
# src/ — all .py files
|
||||||
for py in sorted(SRC_DIR.rglob("*.py")):
|
for py in sorted(SRC_DIR.rglob("*.py")):
|
||||||
@@ -57,8 +83,8 @@ def build_zip(out_path: Path, include_data: bool) -> None:
|
|||||||
zf.write(py, arc)
|
zf.write(py, arc)
|
||||||
n_files += 1
|
n_files += 1
|
||||||
|
|
||||||
# verbatim files
|
# common + mode-specific scripts
|
||||||
for rel in VERBATIM:
|
for rel in COMMON_VERBATIM + MODE_SCRIPTS[mode]:
|
||||||
src = ROOT / rel
|
src = ROOT / rel
|
||||||
if not src.exists():
|
if not src.exists():
|
||||||
print(f"[warn] missing: {src} — skipped", file=sys.stderr)
|
print(f"[warn] missing: {src} — skipped", file=sys.stderr)
|
||||||
@@ -66,22 +92,32 @@ def build_zip(out_path: Path, include_data: bool) -> None:
|
|||||||
zf.write(src, rel)
|
zf.write(src, rel)
|
||||||
n_files += 1
|
n_files += 1
|
||||||
|
|
||||||
# data files with path remapping
|
# data
|
||||||
if include_data:
|
if include_data:
|
||||||
for local_dir, arc_prefix in DATA_REMAP:
|
for local_dir, arc_prefix in data_map:
|
||||||
if not local_dir.exists():
|
if not local_dir.exists():
|
||||||
print(f"[warn] data dir not found: {local_dir} — skipped",
|
print(f"[warn] data dir not found: {local_dir} — skipped",
|
||||||
file=sys.stderr)
|
file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
pts = sorted(local_dir.glob("*.pt"))
|
pts = sorted(local_dir.glob("*.pt"))
|
||||||
for pt in pts:
|
for pt in pts:
|
||||||
arc = f"{arc_prefix}/{pt.name}"
|
zf.write(pt, f"{arc_prefix}/{pt.name}")
|
||||||
zf.write(pt, arc)
|
|
||||||
n_files += 1
|
n_files += 1
|
||||||
print(f"[data] {arc_prefix}: {len(pts)} files")
|
print(f"[data] {arc_prefix}: {len(pts)} files")
|
||||||
else:
|
else:
|
||||||
print("[data] skipped (--no-data)")
|
print("[data] skipped (--no-data)")
|
||||||
|
|
||||||
|
# pretrained checkpoint (finetune mode only, opt-in)
|
||||||
|
if mode == "finetune" and include_checkpoint:
|
||||||
|
if not PRETRAINED_CKPT.exists():
|
||||||
|
print(f"[warn] checkpoint not found: {PRETRAINED_CKPT} — skipped",
|
||||||
|
file=sys.stderr)
|
||||||
|
else:
|
||||||
|
zf.write(PRETRAINED_CKPT, "checkpoints/pretrained.pt")
|
||||||
|
n_files += 1
|
||||||
|
ckpt_mb = PRETRAINED_CKPT.stat().st_size / 1_048_576
|
||||||
|
print(f"[ckpt] checkpoints/pretrained.pt ({ckpt_mb:.1f} MB)")
|
||||||
|
|
||||||
size_mb = out_path.stat().st_size / 1_048_576
|
size_mb = out_path.stat().st_size / 1_048_576
|
||||||
print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)")
|
print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)")
|
||||||
|
|
||||||
@@ -91,13 +127,31 @@ def main() -> None:
|
|||||||
description=__doc__,
|
description=__doc__,
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
)
|
)
|
||||||
ap.add_argument("--output", type=Path, default=DEFAULT_OUT,
|
ap.add_argument(
|
||||||
help="Output zip path (default: hamori_colab.zip)")
|
"--mode", choices=["pretrain", "finetune"], default="pretrain",
|
||||||
ap.add_argument("--no-data", action="store_true",
|
help="Bundle mode: 'pretrain' (default) or 'finetune'.",
|
||||||
help="Exclude .pt data files (bundle code only).")
|
)
|
||||||
|
ap.add_argument(
|
||||||
|
"--output", type=Path, default=None,
|
||||||
|
help="Output zip path. Default: output/hamori_colab_<mode>.zip.",
|
||||||
|
)
|
||||||
|
ap.add_argument(
|
||||||
|
"--no-data", action="store_true",
|
||||||
|
help="Exclude data files (code-only bundle).",
|
||||||
|
)
|
||||||
|
ap.add_argument(
|
||||||
|
"--include-checkpoint", action="store_true", dest="include_checkpoint",
|
||||||
|
help="(finetune mode) Include checkpoints/pretrained.pt in the zip.",
|
||||||
|
)
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
build_zip(args.output, include_data=not args.no_data)
|
out_path = args.output or OUT_DIR / f"hamori_colab_{args.mode}.zip"
|
||||||
|
build_zip(
|
||||||
|
out_path=out_path,
|
||||||
|
mode=args.mode,
|
||||||
|
include_data=not args.no_data,
|
||||||
|
include_checkpoint=args.include_checkpoint,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user