8f657ca916
make_colab_zip.py now accepts --mode pretrain|finetune (default: pretrain).
Finetune mode bundles scripts/train.py + data/processed/user/{train,val}/*.pt
plus an optional --include-checkpoint flag for pretrained.pt.
notebooks/colab_finetune.ipynb covers the full Colab fine-tuning workflow:
upload zip → upload pretrained.pt → verify data → train → inspect → download.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
159 lines
5.3 KiB
Python
159 lines
5.3 KiB
Python
"""Package the hamori project for Google Colab pre-training or fine-tuning.
|
|
|
|
pretrain mode (default):
|
|
- src/ (all Python modules)
|
|
- scripts/pretrain.py
|
|
- requirements.txt
|
|
- data/processed/mcgill/train/*.pt (remapped from data/processed/train/)
|
|
- data/processed/mcgill/val/*.pt (remapped from data/processed/val/)
|
|
|
|
finetune mode:
|
|
- src/ (all Python modules)
|
|
- scripts/train.py
|
|
- requirements.txt
|
|
- data/processed/user/train/*.pt
|
|
- data/processed/user/val/*.pt
|
|
- checkpoints/pretrained.pt (only with --include-checkpoint)
|
|
|
|
Usage:
|
|
# Pre-training bundle (default)
|
|
python scripts/make_colab_zip.py
|
|
python scripts/make_colab_zip.py --mode pretrain
|
|
|
|
# Fine-tuning bundle (run prepare_data.py locally first)
|
|
python scripts/make_colab_zip.py --mode finetune
|
|
|
|
# Fine-tuning bundle with pretrained checkpoint included
|
|
python scripts/make_colab_zip.py --mode finetune --include-checkpoint
|
|
|
|
# Exclude data files (code only)
|
|
python scripts/make_colab_zip.py --mode finetune --no-data
|
|
|
|
# Custom output path
|
|
python scripts/make_colab_zip.py --mode pretrain --output my_bundle.zip
|
|
|
|
Outputs:
|
|
output/hamori_colab_pretrain.zip (pretrain mode)
|
|
output/hamori_colab_finetune.zip (finetune mode)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import sys
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
# Project root: two levels up from this file (the script is invoked as
# scripts/make_colab_zip.py).
ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = ROOT.joinpath("src")
OUT_DIR = ROOT.joinpath("output")

# Files copied into every bundle under the same relative path.
COMMON_VERBATIM: list[str] = ["requirements.txt"]

# Entry-point script(s) bundled for each mode.
MODE_SCRIPTS: dict[str, list[str]] = dict(
    pretrain=["scripts/pretrain.py"],
    finetune=["scripts/train.py"],
)

# Each data map entry is (local directory, archive path prefix inside the zip).
_PROCESSED_DIR = ROOT.joinpath("data", "processed")

# Pre-training data is read from data/processed/{train,val} and stored in the
# archive under data/processed/mcgill/{train,val}.
PRETRAIN_DATA: list[tuple[Path, str]] = [
    (_PROCESSED_DIR / split, f"data/processed/mcgill/{split}")
    for split in ("train", "val")
]

# Fine-tuning data keeps the same layout locally and inside the archive.
FINETUNE_DATA: list[tuple[Path, str]] = [
    (_PROCESSED_DIR / "user" / split, f"data/processed/user/{split}")
    for split in ("train", "val")
]

# Optional checkpoint bundled in finetune mode when explicitly requested.
PRETRAINED_CKPT = ROOT.joinpath("checkpoints", "pretrained.pt")
|
|
|
|
|
|
def build_zip(out_path: Path, mode: str, include_data: bool,
              include_checkpoint: bool) -> None:
    """Write the Colab bundle for *mode* to *out_path*.

    The archive contains every ``*.py`` file under ``SRC_DIR`` (stored under
    ``src/``), the files listed in ``COMMON_VERBATIM`` and
    ``MODE_SCRIPTS[mode]``, and optionally the ``*.pt`` data files and the
    pretrained checkpoint. Missing inputs are reported on stderr and skipped
    rather than aborting the build.

    Args:
        out_path: Destination zip file; its parent directory is created if
            needed and an existing file is overwritten.
        mode: A key of ``MODE_SCRIPTS``. ``"pretrain"`` selects
            ``PRETRAIN_DATA``; any other valid mode selects ``FINETUNE_DATA``.
        include_data: When true, add the ``*.pt`` files from the mode's data
            directories.
        include_checkpoint: When true and *mode* is ``"finetune"``, add
            ``PRETRAINED_CKPT`` as ``checkpoints/pretrained.pt``.

    Raises:
        KeyError: If *mode* is not a key of ``MODE_SCRIPTS``. This is raised
            before anything is written to disk.
    """
    # Resolve everything that depends on `mode` before touching the
    # filesystem, so an unknown mode fails without leaving a partial zip.
    mode_scripts = MODE_SCRIPTS[mode]
    data_map = PRETRAIN_DATA if mode == "pretrain" else FINETUNE_DATA

    if include_checkpoint and mode != "finetune":
        print(f"[warn] --include-checkpoint has no effect in {mode} mode — ignored",
              file=sys.stderr)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    n_files = 0

    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # src/ — all .py files
        py_files = sorted(SRC_DIR.rglob("*.py"))
        if not py_files:
            # A bundle without code is almost certainly a mistake; report it
            # like every other missing input instead of staying silent.
            print(f"[warn] no Python sources found under: {SRC_DIR}",
                  file=sys.stderr)
        for py in py_files:
            arc = "src/" + py.relative_to(SRC_DIR).as_posix()
            zf.write(py, arc)
            n_files += 1

        # common + mode-specific scripts
        for rel in COMMON_VERBATIM + mode_scripts:
            src = ROOT / rel
            if not src.exists():
                print(f"[warn] missing: {src} — skipped", file=sys.stderr)
                continue
            zf.write(src, rel)
            n_files += 1

        # data
        if include_data:
            for local_dir, arc_prefix in data_map:
                if not local_dir.exists():
                    print(f"[warn] data dir not found: {local_dir} — skipped",
                          file=sys.stderr)
                    continue
                pts = sorted(local_dir.glob("*.pt"))
                for pt in pts:
                    zf.write(pt, f"{arc_prefix}/{pt.name}")
                    n_files += 1
                print(f"[data] {arc_prefix}: {len(pts)} files")
        else:
            print("[data] skipped (--no-data)")

        # pretrained checkpoint (finetune mode only, opt-in)
        if mode == "finetune" and include_checkpoint:
            if not PRETRAINED_CKPT.exists():
                print(f"[warn] checkpoint not found: {PRETRAINED_CKPT} — skipped",
                      file=sys.stderr)
            else:
                zf.write(PRETRAINED_CKPT, "checkpoints/pretrained.pt")
                n_files += 1
                ckpt_mb = PRETRAINED_CKPT.stat().st_size / 1_048_576
                print(f"[ckpt] checkpoints/pretrained.pt ({ckpt_mb:.1f} MB)")

    # Measured after the archive is closed so the size is final.
    size_mb = out_path.stat().st_size / 1_048_576
    print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)")
|
|
|
|
|
|
def main() -> None:
    """Parse the command-line options and build the requested bundle."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's layout (usage examples) intact in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--mode",
        default="pretrain",
        choices=["pretrain", "finetune"],
        help="Bundle mode: 'pretrain' (default) or 'finetune'.",
    )
    parser.add_argument(
        "--output",
        default=None,
        type=Path,
        help="Output zip path. Default: output/hamori_colab_<mode>.zip.",
    )
    parser.add_argument(
        "--no-data",
        action="store_true",
        help="Exclude data files (code-only bundle).",
    )
    parser.add_argument(
        "--include-checkpoint",
        dest="include_checkpoint",
        action="store_true",
        help="(finetune mode) Include checkpoints/pretrained.pt in the zip.",
    )
    opts = parser.parse_args()

    # Fall back to the per-mode default location when --output is not given.
    destination = opts.output
    if destination is None:
        destination = OUT_DIR / f"hamori_colab_{opts.mode}.zip"

    build_zip(
        out_path=destination,
        mode=opts.mode,
        include_data=not opts.no_data,
        include_checkpoint=opts.include_checkpoint,
    )
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, so importing the
# module (e.g. to call build_zip directly) has no side effects.
if __name__ == "__main__":
    main()
|