Files
hamori/scripts/make_colab_zip.py
T
H1K0 89770dd009 feat: add Colab bundle script and pre-training notebook
scripts/make_colab_zip.py packages src/, scripts/pretrain.py,
requirements.txt, and processed .pt files into hamori_colab.zip,
remapping data/processed/{train,val}/ -> data/processed/mcgill/{train,val}/
so pretrain.py finds the data without modification.

notebooks/colab_pretrain.ipynb guides through upload, extraction,
dependency install, training run, report display, and results download.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 13:00:03 +03:00

105 lines
3.3 KiB
Python

"""Package the hamori project for Google Colab pre-training.
Creates hamori_colab.zip containing:
- src/ (all Python modules)
- scripts/pretrain.py (pre-training script)
- requirements.txt
- data/processed/mcgill/train/*.pt (remapped from data/processed/train/)
- data/processed/mcgill/val/*.pt (remapped from data/processed/val/)
The local processed data lives at data/processed/{train,val}/ but pretrain.py
expects data/processed/mcgill/{train,val}/. This script remaps the paths
inside the zip so no code changes are needed on Colab.
Usage:
python scripts/make_colab_zip.py
python scripts/make_colab_zip.py --output my_bundle.zip
python scripts/make_colab_zip.py --no-data # skip .pt files (code only)
Output:
hamori_colab.zip (in project root by default)
"""
from __future__ import annotations
import argparse
import sys
import zipfile
from pathlib import Path
# Project root: two directory levels above this file.
ROOT = Path(__file__).resolve().parent.parent

# Default bundle location, directly inside the project root.
DEFAULT_OUT = ROOT.joinpath("hamori_colab.zip")

# Individual files stored in the zip under the same ROOT-relative path.
VERBATIM: list[str] = ["requirements.txt", "scripts/pretrain.py"]

# Source tree whose *.py files are bundled under "src/".
SRC_DIR = ROOT.joinpath("src")

# (local directory, archive prefix) pairs: each local split directory is
# stored under a "mcgill/" sub-path inside the zip.
DATA_REMAP: list[tuple[Path, str]] = [
    (ROOT / "data" / "processed" / split, f"data/processed/mcgill/{split}")
    for split in ("train", "val")
]
def build_zip(out_path: Path, include_data: bool) -> None:
    """Write the Colab bundle to *out_path*.

    The archive receives, in order:

    1. every ``*.py`` file found recursively under ``SRC_DIR``, stored
       under ``src/``;
    2. each entry of ``VERBATIM``, stored at its ROOT-relative path;
    3. when *include_data* is true, the ``*.pt`` files directly inside each
       ``DATA_REMAP`` directory, stored under the remapped archive prefix.

    Missing inputs are reported on stderr and skipped instead of raising,
    so a partial bundle is still produced. A summary line is printed to
    stdout at the end.

    Args:
        out_path: Destination zip file. Parent directories are created if
            needed; the file is opened in ``"w"`` mode.
        include_data: Whether to bundle the processed ``.pt`` files.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    n_files = 0
    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # src/ — all .py files
        sources = sorted(SRC_DIR.rglob("*.py")) if SRC_DIR.is_dir() else []
        if not sources:
            # Warn like the other missing-input cases: a bundle without any
            # source modules is almost certainly not what the user wanted.
            print(
                f"[warn] no .py files found under {SRC_DIR} — "
                "bundle has no source code",
                file=sys.stderr,
            )
        for py in sources:
            arc = "src/" + py.relative_to(SRC_DIR).as_posix()
            zf.write(py, arc)
            n_files += 1

        # verbatim files
        for rel in VERBATIM:
            src = ROOT / rel
            if not src.exists():
                print(f"[warn] missing: {src} — skipped", file=sys.stderr)
                continue
            zf.write(src, rel)
            n_files += 1

        # data files with path remapping
        if include_data:
            for local_dir, arc_prefix in DATA_REMAP:
                if not local_dir.exists():
                    print(f"[warn] data dir not found: {local_dir} — skipped",
                          file=sys.stderr)
                    continue
                pts = sorted(local_dir.glob("*.pt"))
                for pt in pts:
                    arc = f"{arc_prefix}/{pt.name}"
                    zf.write(pt, arc)
                    n_files += 1
                print(f"[data] {arc_prefix}: {len(pts)} files")
        else:
            print("[data] skipped (--no-data)")

    # Measure only after the archive has been closed, so the reported size
    # is that of the finished file.
    size_mb = out_path.stat().st_size / 1_048_576
    print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)")
def main() -> None:
    """Parse the command line and build the bundle.

    ``--output`` selects the zip path (``DEFAULT_OUT`` when omitted);
    ``--no-data`` leaves the ``.pt`` files out of the bundle.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    parser.add_argument(
        "--output",
        default=DEFAULT_OUT,
        type=Path,
        help="Output zip path (default: hamori_colab.zip)",
    )
    parser.add_argument(
        "--no-data",
        help="Exclude .pt data files (bundle code only).",
        action="store_true",
    )
    options = parser.parse_args()
    want_data = not options.no_data
    build_zip(options.output, include_data=want_data)


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()