"""Package the hamori project for Google Colab pre-training or fine-tuning. pretrain mode (default): - src/ (all Python modules) - scripts/pretrain.py - requirements.txt - data/processed/mcgill/train/*.pt - data/processed/mcgill/val/*.pt finetune mode: - src/ (all Python modules) - scripts/train.py - requirements.txt - data/processed/user/train/*.pt - data/processed/user/val/*.pt - checkpoints/pretrained.pt (only with --include-checkpoint) Usage: # Pre-training bundle (default) python scripts/make_colab_zip.py python scripts/make_colab_zip.py --mode pretrain # Fine-tuning bundle (run prepare_data.py locally first) python scripts/make_colab_zip.py --mode finetune # Fine-tuning bundle with pretrained checkpoint included python scripts/make_colab_zip.py --mode finetune --include-checkpoint # Exclude data files (code only) python scripts/make_colab_zip.py --mode finetune --no-data # Custom output path python scripts/make_colab_zip.py --mode pretrain --output my_bundle.zip Outputs: output/hamori_colab_pretrain.zip (pretrain mode) output/hamori_colab_finetune.zip (finetune mode) """ from __future__ import annotations import argparse import sys import zipfile from pathlib import Path ROOT = Path(__file__).resolve().parent.parent SRC_DIR = ROOT / "src" OUT_DIR = ROOT / "output" COMMON_VERBATIM: list[str] = ["requirements.txt"] MODE_SCRIPTS: dict[str, list[str]] = { "pretrain": ["scripts/pretrain.py"], "finetune": ["scripts/train.py"], } # Local dir → arc path inside zip PRETRAIN_DATA: list[tuple[Path, str]] = [ (ROOT / "data" / "processed" / "mcgill" / "train", "data/processed/mcgill/train"), (ROOT / "data" / "processed" / "mcgill" / "val", "data/processed/mcgill/val"), ] FINETUNE_DATA: list[tuple[Path, str]] = [ (ROOT / "data" / "processed" / "user" / "train", "data/processed/user/train"), (ROOT / "data" / "processed" / "user" / "val", "data/processed/user/val"), ] PRETRAINED_CKPT = ROOT / "checkpoints" / "pretrained.pt" def build_zip(out_path: Path, mode: str, include_data: bool, include_checkpoint: bool) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) n_files = 0 data_map = PRETRAIN_DATA if mode == "pretrain" else FINETUNE_DATA with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf: # src/ — all .py files for py in sorted(SRC_DIR.rglob("*.py")): arc = "src/" + py.relative_to(SRC_DIR).as_posix() zf.write(py, arc) n_files += 1 # common + mode-specific scripts for rel in COMMON_VERBATIM + MODE_SCRIPTS[mode]: src = ROOT / rel if not src.exists(): print(f"[warn] missing: {src} — skipped", file=sys.stderr) continue zf.write(src, rel) n_files += 1 # data if include_data: for local_dir, arc_prefix in data_map: if not local_dir.exists(): print(f"[warn] data dir not found: {local_dir} — skipped", file=sys.stderr) continue pts = sorted(local_dir.glob("*.pt")) for pt in pts: zf.write(pt, f"{arc_prefix}/{pt.name}") n_files += 1 print(f"[data] {arc_prefix}: {len(pts)} files") else: print("[data] skipped (--no-data)") # pretrained checkpoint (finetune mode only, opt-in) if mode == "finetune" and include_checkpoint: if not PRETRAINED_CKPT.exists(): print(f"[warn] checkpoint not found: {PRETRAINED_CKPT} — skipped", file=sys.stderr) else: zf.write(PRETRAINED_CKPT, "checkpoints/pretrained.pt") n_files += 1 ckpt_mb = PRETRAINED_CKPT.stat().st_size / 1_048_576 print(f"[ckpt] checkpoints/pretrained.pt ({ckpt_mb:.1f} MB)") size_mb = out_path.stat().st_size / 1_048_576 print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)") def main() -> None: ap = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, ) ap.add_argument( "--mode", choices=["pretrain", "finetune"], default="pretrain", help="Bundle mode: 'pretrain' (default) or 'finetune'.", ) ap.add_argument( "--output", type=Path, default=None, help="Output zip path. Default: output/hamori_colab_.zip.", ) ap.add_argument( "--no-data", action="store_true", help="Exclude data files (code-only bundle).", ) ap.add_argument( "--include-checkpoint", action="store_true", dest="include_checkpoint", help="(finetune mode) Include checkpoints/pretrained.pt in the zip.", ) args = ap.parse_args() out_path = args.output or OUT_DIR / f"hamori_colab_{args.mode}.zip" build_zip( out_path=out_path, mode=args.mode, include_data=not args.no_data, include_checkpoint=args.include_checkpoint, ) if __name__ == "__main__": main()