Files
hamori/scripts/make_colab_zip.py
H1K0 c4dd2fb690 refactor: reorganize data/processed/ into mcgill/ and user/ subdirs
Moved data/processed/{train,val,holdout}/ → data/processed/mcgill/{train,val,holdout}/
so both corpora have their own namespace under data/processed/.
Updated PRETRAIN_DATA paths in make_colab_zip.py accordingly
(path remap workaround no longer needed).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 19:47:32 +03:00

159 lines
5.2 KiB
Python

"""Package the hamori project for Google Colab pre-training or fine-tuning.
pretrain mode (default):
- src/ (all Python modules)
- scripts/pretrain.py
- requirements.txt
- data/processed/mcgill/train/*.pt
- data/processed/mcgill/val/*.pt
finetune mode:
- src/ (all Python modules)
- scripts/train.py
- requirements.txt
- data/processed/user/train/*.pt
- data/processed/user/val/*.pt
- checkpoints/pretrained.pt (only with --include-checkpoint)
Usage:
# Pre-training bundle (default)
python scripts/make_colab_zip.py
python scripts/make_colab_zip.py --mode pretrain
# Fine-tuning bundle (run prepare_data.py locally first)
python scripts/make_colab_zip.py --mode finetune
# Fine-tuning bundle with pretrained checkpoint included
python scripts/make_colab_zip.py --mode finetune --include-checkpoint
# Exclude data files (code only)
python scripts/make_colab_zip.py --mode finetune --no-data
# Custom output path
python scripts/make_colab_zip.py --mode pretrain --output my_bundle.zip
Outputs:
output/hamori_colab_pretrain.zip (pretrain mode)
output/hamori_colab_finetune.zip (finetune mode)
"""
from __future__ import annotations
import argparse
import sys
import zipfile
from pathlib import Path
# Repository root: this file lives in <root>/scripts/, so go up two levels.
ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = ROOT.joinpath("src")
OUT_DIR = ROOT.joinpath("output")

# Files copied into every bundle under their repo-relative path.
COMMON_VERBATIM: list[str] = ["requirements.txt"]

# Entry-point script(s) shipped with each bundle mode.
MODE_SCRIPTS: dict[str, list[str]] = {
    "pretrain": ["scripts/pretrain.py"],
    "finetune": ["scripts/train.py"],
}


def _corpus_splits(corpus: str) -> list[tuple[Path, str]]:
    """Return (local dir, arc path inside zip) pairs for a corpus' train/val splits."""
    return [
        (
            ROOT.joinpath("data", "processed", corpus, split),
            f"data/processed/{corpus}/{split}",
        )
        for split in ("train", "val")
    ]


# Local dir → arc path inside zip
PRETRAIN_DATA: list[tuple[Path, str]] = _corpus_splits("mcgill")
FINETUNE_DATA: list[tuple[Path, str]] = _corpus_splits("user")

# Checkpoint optionally added to fine-tuning bundles.
PRETRAINED_CKPT = ROOT.joinpath("checkpoints", "pretrained.pt")
def build_zip(out_path: Path, mode: str, include_data: bool,
              include_checkpoint: bool) -> None:
    """Write the Colab bundle for *mode* to *out_path*.

    The archive receives every ``*.py`` file under ``SRC_DIR``, the files
    listed in ``COMMON_VERBATIM``, the scripts registered for *mode* in
    ``MODE_SCRIPTS``, optionally the ``*.pt`` files of the mode's data
    directories, and (finetune mode only) optionally the pretrained
    checkpoint.  Inputs that are missing on disk are reported on stderr
    and skipped; they do not abort the build.

    Args:
        out_path: Destination zip file. Parent directories are created.
        mode: A key of ``MODE_SCRIPTS`` ("pretrain" or "finetune").
        include_data: When false, no data files are added.
        include_checkpoint: When true and *mode* is "finetune", add
            ``checkpoints/pretrained.pt`` if it exists.

    Raises:
        KeyError: If *mode* is not a key of ``MODE_SCRIPTS``.
    """
    # Look the mode up first: an unknown mode raises KeyError here, before
    # the output directory or a partially written archive is created.
    mode_scripts = MODE_SCRIPTS[mode]
    data_map = PRETRAIN_DATA if mode == "pretrain" else FINETUNE_DATA

    if include_checkpoint and mode != "finetune":
        # The checkpoint is only bundled for fine-tuning; say so instead of
        # silently dropping the request.
        print("[warn] --include-checkpoint only applies to finetune mode — ignored",
              file=sys.stderr)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    n_files = 0

    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # src/ — all .py files (sorted for a deterministic archive order)
        for py in sorted(SRC_DIR.rglob("*.py")):
            arc = "src/" + py.relative_to(SRC_DIR).as_posix()
            zf.write(py, arc)
            n_files += 1

        # common + mode-specific scripts, stored under their repo-relative path
        for rel in COMMON_VERBATIM + mode_scripts:
            src = ROOT / rel
            if not src.exists():
                print(f"[warn] missing: {src} — skipped", file=sys.stderr)
                continue
            zf.write(src, rel)
            n_files += 1

        # data
        if include_data:
            for local_dir, arc_prefix in data_map:
                if not local_dir.exists():
                    print(f"[warn] data dir not found: {local_dir} — skipped",
                          file=sys.stderr)
                    continue
                pts = sorted(local_dir.glob("*.pt"))
                for pt in pts:
                    zf.write(pt, f"{arc_prefix}/{pt.name}")
                    n_files += 1
                print(f"[data] {arc_prefix}: {len(pts)} files")
        else:
            print("[data] skipped (--no-data)")

        # pretrained checkpoint (finetune mode only, opt-in)
        if mode == "finetune" and include_checkpoint:
            if not PRETRAINED_CKPT.exists():
                print(f"[warn] checkpoint not found: {PRETRAINED_CKPT} — skipped",
                      file=sys.stderr)
            else:
                zf.write(PRETRAINED_CKPT, "checkpoints/pretrained.pt")
                n_files += 1
                ckpt_mb = PRETRAINED_CKPT.stat().st_size / 1_048_576
                print(f"[ckpt] checkpoints/pretrained.pt ({ckpt_mb:.1f} MB)")

    # Measured after the archive is closed so the size is final.
    size_mb = out_path.stat().st_size / 1_048_576
    print(f"[done] {out_path} ({n_files} files, {size_mb:.1f} MB)")
def main() -> None:
    """Parse the command line and build the requested Colab bundle."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    parser.add_argument(
        "--mode",
        default="pretrain",
        choices=["pretrain", "finetune"],
        help="Bundle mode: 'pretrain' (default) or 'finetune'.",
    )
    parser.add_argument(
        "--output",
        default=None,
        type=Path,
        help="Output zip path. Default: output/hamori_colab_<mode>.zip.",
    )
    parser.add_argument(
        "--no-data",
        action="store_true",
        help="Exclude data files (code-only bundle).",
    )
    parser.add_argument(
        "--include-checkpoint",
        dest="include_checkpoint",
        action="store_true",
        help="(finetune mode) Include checkpoints/pretrained.pt in the zip.",
    )
    opts = parser.parse_args()

    # Fall back to the per-mode default location when --output is not given.
    if opts.output:
        destination = opts.output
    else:
        destination = OUT_DIR / f"hamori_colab_{opts.mode}.zip"

    want_data = not opts.no_data
    build_zip(
        out_path=destination,
        mode=opts.mode,
        include_data=want_data,
        include_checkpoint=opts.include_checkpoint,
    )
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()