"""Package the hamori project for Google Colab pre-training. Creates hamori_colab.zip containing: - src/ (all Python modules) - scripts/pretrain.py (pre-training script) - requirements.txt - data/processed/mcgill/train/*.pt (remapped from data/processed/train/) - data/processed/mcgill/val/*.pt (remapped from data/processed/val/) The local processed data lives at data/processed/{train,val}/ but pretrain.py expects data/processed/mcgill/{train,val}/. This script remaps the paths inside the zip so no code changes are needed on Colab. Usage: python scripts/make_colab_zip.py python scripts/make_colab_zip.py --output my_bundle.zip python scripts/make_colab_zip.py --no-data # skip .pt files (code only) Output: hamori_colab.zip (in project root by default) """ from __future__ import annotations import argparse import sys import zipfile from pathlib import Path ROOT = Path(__file__).resolve().parent.parent DEFAULT_OUT = ROOT / "hamori_colab.zip" # Files/dirs to include verbatim (paths relative to ROOT) VERBATIM: list[str] = [ "requirements.txt", "scripts/pretrain.py", ] SRC_DIR = ROOT / "src" # Local data dirs → path inside zip DATA_REMAP: list[tuple[Path, str]] = [ (ROOT / "data" / "processed" / "train", "data/processed/mcgill/train"), (ROOT / "data" / "processed" / "val", "data/processed/mcgill/val"), ] def build_zip(out_path: Path, include_data: bool) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) n_files = 0 with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf: # src/ — all .py files for py in sorted(SRC_DIR.rglob("*.py")): arc = "src/" + py.relative_to(SRC_DIR).as_posix() zf.write(py, arc) n_files += 1 # verbatim files for rel in VERBATIM: src = ROOT / rel if not src.exists(): print(f"[warn] missing: {src} — skipped", file=sys.stderr) continue zf.write(src, rel) n_files += 1 # data files with path remapping if include_data: for local_dir, arc_prefix in DATA_REMAP: if not local_dir.exists(): print(f"[warn] data dir not found: {local_dir} — skipped", file=sys.stderr) continue pts = sorted(local_dir.glob("*.pt")) for pt in pts: arc = f"{arc_prefix}/{pt.name}" zf.write(pt, arc) n_files += 1 print(f"[data] {arc_prefix}: {len(pts)} files") else: print("[data] skipped (--no-data)") size_mb = out_path.stat().st_size / 1_048_576 print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)") def main() -> None: ap = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, ) ap.add_argument("--output", type=Path, default=DEFAULT_OUT, help="Output zip path (default: hamori_colab.zip)") ap.add_argument("--no-data", action="store_true", help="Exclude .pt data files (bundle code only).") args = ap.parse_args() build_zip(args.output, include_data=not args.no_data) if __name__ == "__main__": main()