From 8672c10f78d94a146885e1a05299a65567ef9a7a Mon Sep 17 00:00:00 2001 From: Masahiko AMANO Date: Tue, 19 May 2026 10:28:17 +0300 Subject: [PATCH] chore: initialize project scaffold Add .gitignore (excludes .claude/, venv, checkpoints, processed data, external corpora), .gitattributes (LF normalization, binary markers), full directory tree with .gitkeep placeholders, and src __init__ stubs. Co-Authored-By: Claude Sonnet 4.6 --- .gitattributes | 32 ++++++ .gitignore | 61 ++++++++++ CLAUDE.md | 172 ++++++++++++++++++++++++++++ checkpoints/.gitkeep | 0 data/holdout/.gitkeep | 0 data/processed/.gitkeep | 0 data/raw_user/.gitkeep | 0 src/__init__.py | 0 src/external_converters/__init__.py | 0 tests/fixtures/.gitkeep | 0 10 files changed, 265 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 CLAUDE.md create mode 100644 checkpoints/.gitkeep create mode 100644 data/holdout/.gitkeep create mode 100644 data/processed/.gitkeep create mode 100644 data/raw_user/.gitkeep create mode 100644 src/__init__.py create mode 100644 src/external_converters/__init__.py create mode 100644 tests/fixtures/.gitkeep diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b2b80d5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,32 @@ +# Normalize line endings to LF on commit (cross-platform safety) +* text=auto eol=lf + +# Python source +*.py text eol=lf + +# Custom text formats +*.chord text eol=lf +*.md text eol=lf +*.txt text eol=lf +*.csv text eol=lf +*.json text eol=lf +*.yaml text eol=lf +*.yml text eol=lf +*.toml text eol=lf +*.cfg text eol=lf +*.ini text eol=lf + +# Binary assets — never diff/merge +*.pt binary +*.pth binary +*.ckpt binary +*.pkl binary +*.mid binary +*.midi binary +*.png binary +*.jpg binary +*.jpeg binary +*.pdf binary +*.zip binary +*.gz binary +*.tar binary diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..83016e0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,61 @@ +# Python 
+__pycache__/ +*.py[cod] +*.pyo +*.pyd +.Python +*.egg-info/ +dist/ +build/ +*.egg +.eggs/ + +# Virtual environments +.venv/ +venv/ +env/ +ENV/ + +# pytest +.pytest_cache/ +.cache/ +htmlcov/ +.coverage +coverage.xml + +# Jupyter +.ipynb_checkpoints/ +*.ipynb_checkpoints + +# Model checkpoints (large binaries — commit only intentionally) +checkpoints/*.pt +checkpoints/*.pth +checkpoints/*.ckpt + +# Processed data (reproducible from source) +data/processed/*.pt +data/processed/*.pkl + +# External corpora (download separately; too large for git) +data/raw_external/ + +# OS +.DS_Store +Thumbs.db +desktop.ini + +# IDEs +.idea/ +*.swp +*.swo + +# Claude Code +.claude/ + +# Logs +*.log +logs/ + +# Misc +*.tmp +*.bak diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..713dd9e --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,172 @@ +# CLAUDE.md + +This file gives Claude Code persistent context for the project. Read it before any non-trivial task. + +## Project overview + +**Goal.** Train a small autoregressive transformer to generate harmonic periods (4–16 bar chord progressions) in the author's compositional style. Coursework deliverable for an ML class at RTU MIREA; also intended as a working creative tool. + +**Unit of generation.** A single closed harmonic phrase (a "period"), not a full song. + +**Pipeline.** +1. Hand-transcribe own compositions from REAPER DAW projects into `.chord` text files. +2. Parse `.chord` → factorized token sequences. +3. Pre-train on a public corpus (McGill Billboard or similar). +4. Fine-tune on the author's own corpus. +5. Sample new periods conditioned on mode / time / style / function / optional chord prefix. +6. Detokenize back to `.chord` + export to MIDI for use in REAPER. + +**Hard deadline.** Less than one month, ~50 hours of work budget. 
+ +## Tech stack + +- **Python 3.11+** +- **PyTorch** (no Lightning unless complexity demands it — keep training loops readable) +- **music21** for chord symbol parsing (`music21.harmony.ChordSymbol`) +- **pretty_midi** for MIDI generation +- **pytest** for unit tests +- **matplotlib** for plots in the report +- **NumPy, pandas** as standard +- Optional: **Google Colab** for training if local hardware is insufficient. Model is small enough that CPU is viable. + +Avoid heavy abstractions. This is coursework, not a production system. Prefer simple imperative scripts over framework-style code. + +## Repository layout + +``` +chord-gen/ +├── CLAUDE.md ← this file +├── README.md +├── requirements.txt +├── docs/ +│ └── chord_format_spec.md ← authoritative format specification +├── data/ +│ ├── raw_user/ ← hand-transcribed .chord files (own corpus) +│ ├── raw_external/ ← public corpora (McGill Billboard etc.) +│ ├── processed/ ← tokenized .pt files ready for training +│ └── holdout/ ← held-out periods for evaluation +├── src/ +│ ├── __init__.py +│ ├── tokenizer.py ← .chord ↔ token sequences +│ ├── chord_parser.py ← chord symbol → (root, qual, ext, bass) +│ ├── midi_export.py ← .chord → MIDI for sanity check & user output +│ ├── dataset.py ← PyTorch Dataset over tokenized files +│ ├── model.py ← small transformer +│ ├── train.py ← pre-train and fine-tune entry points +│ ├── generate.py ← inference / sampling +│ ├── evaluate.py ← perplexity + distribution metrics +│ └── external_converters/ +│ └── mcgill_to_chord.py ← convert McGill Billboard to .chord +├── tests/ +│ ├── test_chord_parser.py +│ ├── test_tokenizer.py +│ ├── test_midi_export.py +│ └── fixtures/ +│ └── *.chord +├── notebooks/ +│ ├── 01_data_exploration.ipynb +│ ├── 02_training.ipynb +│ └── 03_evaluation.ipynb +└── checkpoints/ + ├── pretrained.pt + └── finetuned.pt +``` + +## The `.chord` format + +The authoritative specification is in `docs/chord_format_spec.md`. 
**Always read it before modifying anything that touches the format or the tokenizer.** Critical points summarized here for context only — if anything conflicts, the spec wins. + +- One file = one harmonic period (4–16 bars). +- Header lines start with `#`, list `title`, `key`, `time`, `subdivision`, `style`, optional `function`. +- Body: bars separated by `|`, exactly `subdivision` positions per bar (for 4/4), positions separated by single spaces. +- A position holds: chord symbol, `.` (hold previous), `NC` (no chord), or `?` (unknown). +- Chord symbols: `<root><quality><extension>(/<bass>)?`. 18 qualities, 7 extensions, slash inversions are mandatory and meaningful. +- Tokenization: each new chord becomes exactly 4 tokens (`ROOT_x`, `QUAL_x`, `EXT_x`, `BASS_x`). Hold = `HOLD`. Bar end = `BAR`. Plus metadata tokens at the start. +- **Keys are normalized.** Before tokenization, the entire period is transposed: majors → C major, minors → A minor. The model never sees absolute keys. The vocabulary contains `MODE_major`/`MODE_minor` but no `KEY_x` tokens. +- Vocabulary size: ~81 tokens. + +## Model + +A small autoregressive transformer: +- Layers: 2–4 +- d_model: 128–256 +- Heads: 4–8 +- FFN dim: 4 × d_model +- Context length: 512 tokens (more than enough for any single period) +- Tied input/output embeddings +- Standard causal mask, next-token prediction with cross-entropy +- AdamW, cosine schedule, warmup ~5% of steps +- Dropout 0.1–0.2 + +Pre-training uses the full public corpus. Fine-tuning uses the author's own corpus with a **smaller learning rate** (e.g. 1e-5 vs 1e-4 for pre-training) and **few epochs** (5–15) to avoid catastrophic forgetting of harmonic regularities learned during pre-training. + +## Inference + +- Top-p sampling (nucleus, p ≈ 0.9) with temperature ≈ 1.0 as defaults. Tunable. +- No beam search — it generally hurts on generative tasks like this. +- Generation is conditioned by feeding the BOS + metadata tokens explicitly, then optionally a chord prefix from the user. 
+- After generation, transpose from C/Am to the user's requested key. +- Output: both a `.chord` file and a MIDI file. + +## Evaluation + +For the report: +1. **Perplexity** on the holdout set, comparing pre-trained baseline vs fine-tuned. +2. **Distribution shift plots** — histograms over chord qualities, extension presence, inversion frequency, root motion intervals — showing how fine-tuning moves the distribution toward the author's corpus. +3. **Qualitative cherry-picked generations** — 3 examples with the same seed/prefix, generated by baseline vs fine-tuned, rendered to MIDI. + +No formal blind listening test (out of scope for the deadline). + +## Working language + +- **Code, identifiers, code comments, log messages, commit messages: English.** +- **User-facing output, the academic report, and the README user guide: Russian** (per university requirements; the report must comply with GOST). +- **Conversations with the developer (the author): Russian.** + +When generating commit messages or code comments, write in English. When generating the report or any user-facing text, write in Russian. + +## Code style & conventions + +- Type hints on all public functions. +- Docstrings: one-line summary + Args/Returns. Keep them concise. +- No `print()` in library code — use the `logging` module. CLI scripts may use `print`. +- Constants in `UPPER_SNAKE_CASE` at module top. +- Vocabulary, token IDs, and label maps live in `src/tokenizer.py` as module-level constants. +- Random seeds: every training / generation script accepts a `--seed` flag and sets `torch.manual_seed`, `numpy.random.seed`, `random.seed`. +- Reproducibility is more important than performance. If a choice is between "fast" and "deterministic", choose deterministic. + +## Testing policy + +- Every parser/tokenizer/MIDI module has unit tests. +- Tests use small `.chord` fixtures in `tests/fixtures/`. 
+- Round-trip property: `tokenize(parse(file))` followed by `detokenize(...)` must reproduce the chord sequence (up to canonical normalization). +- Don't bother unit-testing the training loop or the model. Test the data path. + +## Things to never do + +- **Do not change the `.chord` format** without first updating `docs/chord_format_spec.md` and bumping its version number. The format is the contract between the human-readable data and the model; changing one side silently breaks everything. +- **Do not modify files in `data/holdout/` or use them during training.** Holdout is held out. +- **Do not add new model architectures "to compare"** unless explicitly asked. One model, done well, beats four half-done. +- **Do not implement bells and whistles** (web UI, real-time audio synthesis, beam search, voicing models). They are explicitly out of scope. +- **Do not silently round or coerce unrecognized chord symbols.** If a chord can't be parsed, raise an error with the file name, bar number, and position. Silent corruption of training data is the worst failure mode here. + +## Things to always do + +- When asked to add a feature, first identify which module it belongs in (`src/...`) and whether it requires a spec change. State this before writing code. +- When the user describes a bug, write a failing test first, then fix. +- When dependencies change, update `requirements.txt`. +- When adding a CLI script, include `--help` output and a usage example in the script's docstring. +- When a change introduces new parameters, ask whether a CLI flag or a config file is preferred. Default to CLI flags for simplicity. 
+ +## Out-of-scope (explicit non-goals for this deliverable) + +- Melody generation +- Voice leading / voicing inside chords above the bass +- Rhythmic patterns inside a held chord +- Arrangement, timbre, dynamics +- Web interface / GUI +- Real-time MIDI integration with REAPER +- Modulation handling inside a single period +- J-Pop fine-tuning experiment (future work after coursework deadline) + +If the user asks for any of these, remind them it's out of scope and ask whether to proceed anyway or defer. diff --git a/checkpoints/.gitkeep b/checkpoints/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/holdout/.gitkeep b/data/holdout/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/processed/.gitkeep b/data/processed/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/raw_user/.gitkeep b/data/raw_user/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/external_converters/__init__.py b/src/external_converters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep new file mode 100644 index 0000000..e69de29