89770dd009
scripts/make_colab_zip.py packages src/, scripts/pretrain.py,
requirements.txt, and processed .pt files into hamori_colab.zip,
remapping data/processed/{train,val}/ -> data/processed/mcgill/{train,val}/
so pretrain.py finds the data without modification.
notebooks/colab_pretrain.ipynb guides through upload, extraction,
dependency install, training run, report display, and results download.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
5.8 KiB
5.8 KiB
hamori — pre-training on McGill Billboard corpus
This notebook runs scripts/pretrain.py on Google Colab (GPU T4 recommended).
Steps:
- Check GPU
- Upload hamori_colab.zip (built locally via python scripts/make_colab_zip.py)
- Extract and install dependencies
- Run pre-training
- Download checkpoint and logs
In [ ]:
# ── 1. GPU check ────────────────────────────────────────────────────────────
# Report the attached CUDA device, or explain how to enable one in Colab.
import torch

if not torch.cuda.is_available():
    print("No GPU found — training will be slow on CPU.")
    print("Go to Runtime → Change runtime type → T4 GPU and re-run.")
else:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    # total_memory is reported in bytes; shown here in decimal gigabytes.
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"VRAM: {vram_gb:.1f} GB")
In [ ]:
# ── 2. Upload hamori_colab.zip ───────────────────────────────────────────────
# Build it locally first: python scripts/make_colab_zip.py
from google.colab import files

uploaded = files.upload()  # select hamori_colab.zip from your machine
print("Uploaded:", list(uploaded.keys()))

# The extraction step opens the archive by this exact file name, so flag a
# missing or differently named upload here instead of failing later.
if "hamori_colab.zip" not in uploaded:
    print(
        "WARNING: hamori_colab.zip was not among the uploaded files — "
        "the extraction step expects exactly that file name."
    )
In [ ]:
# ── 3. Extract and install dependencies ─────────────────────────────────────
import os
import zipfile

WORK_DIR = "/content/hamori"

# Unpack the uploaded archive into its own project directory.
os.makedirs(WORK_DIR, exist_ok=True)
with zipfile.ZipFile("hamori_colab.zip") as archive:
    archive.extractall(WORK_DIR)
    member_count = len(archive.namelist())
print(f"Extracted {member_count} files to {WORK_DIR}")

# The following cells use paths relative to the project root.
os.chdir(WORK_DIR)
print("Working directory:", os.getcwd())
In [ ]:
# Colab ships torch, so only the extra packages are installed here.
# -q keeps pip's output quiet.
!pip install -q pretty_midi mido matplotlib
In [ ]:
# ── 4. Verify data ───────────────────────────────────────────────────────────
# Count the processed .pt files in each split and flag an empty split before
# the training run is started.
from pathlib import Path

processed_root = Path("data/processed/mcgill")
train_pt = list((processed_root / "train").glob("*.pt"))
val_pt = list((processed_root / "val").glob("*.pt"))
print(f"Train: {len(train_pt)} files")
print(f"Val: {len(val_pt)} files")
if not train_pt:
    print("ERROR: no training data found — did you build the zip with data included?")
if not val_pt:
    # The validation split was counted but never checked before.
    print("WARNING: no validation data found — check that the zip includes data/processed/mcgill/val.")
In [ ]:
# ── 5. Pre-train ─────────────────────────────────────────────────────────────
# Runs the pre-training script with a path relative to the current working
# directory, so the extraction cell (which changes into the project directory)
# must have run first.
# Outputs:
#   checkpoints/pretrained.pt
#   checkpoints/pretrained.log.csv
#   checkpoints/pretrained_curves.png
#   checkpoints/pretrained.report.txt
!python scripts/pretrain.py
In [ ]:
# ── 6. Show report ───────────────────────────────────────────────────────────
# Print the text report from the training run, or say that it is missing.
report = Path("checkpoints/pretrained.report.txt")
if not report.exists():
    print("Report not found — training may have failed.")
else:
    print(report.read_text(encoding="utf-8"))
In [ ]:
# Show loss curves inline
# Guard on the file's existence, as the report cell does: passing the path of
# a missing file to Image() makes it treat the string as raw image data and
# fail with a confusing error.
from pathlib import Path

from IPython.display import Image, display

curves = Path("checkpoints/pretrained_curves.png")
if curves.exists():
    # display() is required because the image is no longer the cell's last
    # expression, so it would not be rendered automatically.
    display(Image(filename=str(curves)))
else:
    print("Loss curves not found — training may have failed.")
In [ ]:
# ── 7. Download results ──────────────────────────────────────────────────────
import shutil

from google.colab import files

# Zip the checkpoints/ directory (relative to WORK_DIR) into one archive
# under /content, then hand it to the browser for download.
shutil.make_archive(
    "/content/pretrain_results",
    "zip",
    root_dir=WORK_DIR,
    base_dir="checkpoints",
)
files.download("/content/pretrain_results.zip")