hamori/notebooks/colab_pretrain.ipynb

{
 "nbformat": 4,
 "nbformat_minor": 5,
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.0"
  },
  "colab": {
   "provenance": [],
   "gpuType": "T4"
  },
  "accelerator": "GPU"
 },
 "cells": [
  {
   "cell_type": "markdown",
   "id": "title",
   "metadata": {},
   "source": [
    "# hamori — pre-training on McGill Billboard corpus\n",
    "\n",
    "This notebook runs `scripts/pretrain.py` on Google Colab (T4 GPU recommended).\n",
    "\n",
    "**Steps:**\n",
    "1. Check GPU\n",
    "2. Upload `hamori_colab.zip` (built locally via `python scripts/make_colab_zip.py`)\n",
    "3. Extract and install dependencies\n",
    "4. Verify that the pre-processed data is present\n",
    "5. Run pre-training\n",
    "6. Review the training report and loss curves\n",
    "7. Download checkpoint and logs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "gpu-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── 1. GPU check ────────────────────────────────────────────────────────────\n",
    "# Report the CUDA device torch can see, or explain how to attach one.\n",
    "import torch\n",
    "\n",
    "if not torch.cuda.is_available():\n",
    "    # No CUDA device visible to torch: tell the user how to switch runtimes.\n",
    "    print(\"No GPU found — training will be slow on CPU.\")\n",
    "    print(\"Go to Runtime → Change runtime type → T4 GPU and re-run.\")\n",
    "else:\n",
    "    # Device 0 is queried for its name and total memory (bytes → GB via 1e9).\n",
    "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
    "    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "upload",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── 2. Upload hamori_colab.zip ───────────────────────────────────────────────\n",
    "# Prerequisite, on your own machine:  python scripts/make_colab_zip.py\n",
    "from google.colab import files\n",
    "\n",
    "# Pick hamori_colab.zip from your machine when prompted.\n",
    "uploaded = files.upload()\n",
    "\n",
    "# Echo the uploaded file names so a wrong selection is easy to spot.\n",
    "print(\"Uploaded:\", list(uploaded))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "extract",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── 3. Extract and install dependencies ─────────────────────────────────────\n",
    "# Unpack the uploaded archive into WORK_DIR and make WORK_DIR the current\n",
    "# directory so later cells can use project-relative paths.\n",
    "import os\n",
    "import zipfile\n",
    "from pathlib import Path\n",
    "\n",
    "WORK_DIR = \"/content/hamori\"\n",
    "ZIP_NAME = \"hamori_colab.zip\"\n",
    "\n",
    "# This cell ends with os.chdir(WORK_DIR), so on a re-run the bare file name\n",
    "# would be looked up inside WORK_DIR and fail. Resolve the archive to an\n",
    "# absolute path instead: current directory first, then WORK_DIR's parent.\n",
    "_zip_candidates = [Path.cwd() / ZIP_NAME, Path(WORK_DIR).parent / ZIP_NAME]\n",
    "zip_path = next((p for p in _zip_candidates if p.is_file()), None)\n",
    "if zip_path is None:\n",
    "    raise FileNotFoundError(\n",
    "        f\"{ZIP_NAME} not found in {Path.cwd()} or {Path(WORK_DIR).parent} — run the upload cell first.\"\n",
    "    )\n",
    "\n",
    "os.makedirs(WORK_DIR, exist_ok=True)\n",
    "\n",
    "with zipfile.ZipFile(zip_path) as zf:\n",
    "    zf.extractall(WORK_DIR)\n",
    "    print(f\"Extracted {len(zf.namelist())} files to {WORK_DIR}\")\n",
    "\n",
    "os.chdir(WORK_DIR)\n",
    "print(\"Working directory:\", os.getcwd())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "install-deps",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Colab ships torch; only install the extra deps.\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q pretty_midi mido matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "data-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── 4. Verify data ───────────────────────────────────────────────────────────\n",
    "# Count the *.pt files in the train/val folders (paths are relative to the\n",
    "# current working directory) and stop if there is no training data.\n",
    "from pathlib import Path\n",
    "\n",
    "DATA_ROOT = Path(\"data/processed/mcgill\")\n",
    "train_pt = list((DATA_ROOT / \"train\").glob(\"*.pt\"))\n",
    "val_pt   = list((DATA_ROOT / \"val\").glob(\"*.pt\"))\n",
    "print(f\"Train: {len(train_pt)} files\")\n",
    "print(f\"Val:   {len(val_pt)} files\")\n",
    "if not train_pt:\n",
    "    # Raise instead of printing so that \"Run all\" halts here rather than\n",
    "    # going on to launch training with nothing to train on.\n",
    "    raise FileNotFoundError(\n",
    "        \"ERROR: no training data found — did you build the zip with data included?\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "pretrain",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── 5. Pre-train ─────────────────────────────────────────────────────────────\n",
    "# Runs scripts/pretrain.py as a shell command. The script path is relative,\n",
    "# so the current working directory must be the extracted project root.\n",
    "# NOTE(review): the script is invoked with no arguments, so it presumably uses\n",
    "# its own defaults for data and output locations — confirm in scripts/pretrain.py.\n",
    "#\n",
    "# Outputs:\n",
    "#   checkpoints/pretrained.pt\n",
    "#   checkpoints/pretrained.log.csv\n",
    "#   checkpoints/pretrained_curves.png\n",
    "#   checkpoints/pretrained.report.txt\n",
    "!python scripts/pretrain.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "show-report",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── 6. Show report ───────────────────────────────────────────────────────────\n",
    "# Print the text report from the checkpoints folder, or say that it is missing.\n",
    "report = Path(\"checkpoints/pretrained.report.txt\")\n",
    "if not report.exists():\n",
    "    print(\"Report not found — training may have failed.\")\n",
    "else:\n",
    "    print(report.read_text(encoding=\"utf-8\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "show-curves",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show loss curves inline\n",
    "from pathlib import Path\n",
    "\n",
    "from IPython.display import Image, display\n",
    "\n",
    "curves = Path(\"checkpoints/pretrained_curves.png\")\n",
    "if curves.exists():\n",
    "    display(Image(filename=str(curves)))\n",
    "else:\n",
    "    # Without this check a missing PNG raises FileNotFoundError; print a\n",
    "    # short note instead, the same way the report cell handles a missing file.\n",
    "    print(\"Loss curves not found — training may have failed.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "download",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── 7. Download results ──────────────────────────────────────────────────────\n",
    "import shutil\n",
    "from google.colab import files\n",
    "\n",
    "# Pack the checkpoints/ folder (resolved relative to WORK_DIR) into a single\n",
    "# zip archive, then hand that archive to the browser for download.\n",
    "shutil.make_archive(\n",
    "    base_name=\"/content/pretrain_results\",\n",
    "    format=\"zip\",\n",
    "    root_dir=WORK_DIR,\n",
    "    base_dir=\"checkpoints\",\n",
    ")\n",
    "files.download(\"/content/pretrain_results.zip\")"
   ]
  }
 ]
}