feat: implement .chord file parser and canonical transposer; freeze requirements

src/tokenizer.py:
  - parse_chord_file(Path) → ChordPeriod: reads header + bar body, strips //
    comments, validates header fields, bar position counts and chord symbols;
    raises ChordFormatError naming the file (and, for errors in the body, the
    bar number) on any violation.
  - transpose_to_canonical(ChordPeriod) → ChordPeriod: shifts all chord roots
    and bass notes by the semitone offset to C major / A minor; fast-path
    returns the original object when shift == 0.

tests/test_chord_file_parser.py: 39 tests covering parsing of 4 valid fixtures
  (C major, F# major, B minor, G# minor), error messages for 2 invalid
  fixtures, and transposition correctness including slash chord root+bass.

tests/fixtures/: 6 .chord fixture files (4 valid, 2 invalid).

requirements.txt: pinned to current latest stable versions
  (torch 2.12.0, music21 10.1.0, pretty_midi 0.2.11, matplotlib 3.10.9,
  numpy 2.4.6, pandas 3.0.3, pytest 9.0.3); Python >= 3.11 noted.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-19 15:27:57 +03:00
parent dd77de00d0
commit a473499fac
9 changed files with 612 additions and 7 deletions
+17 -7
View File
@@ -1,7 +1,17 @@
torch
music21
pretty_midi
pytest
matplotlib
numpy
pandas
# Python >= 3.11 required
# Tested on Python 3.12.10
# Core ML
torch==2.12.0
numpy==2.4.6
pandas==3.0.3
# Music processing
music21==10.1.0
pretty_midi==0.2.11
# Visualization
matplotlib==3.10.9
# Testing
pytest==9.0.3
+279
View File
@@ -0,0 +1,279 @@
"""Parser and canonical transposer for .chord files.
Public API (token-ID conversion will be added in the next step):
parse_chord_file(path: Path) -> ChordPeriod
transpose_to_canonical(period: ChordPeriod) -> ChordPeriod
See docs/chord_format_spec.md for the format specification.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, replace
from pathlib import Path
from src.chord_parser import ChordParseError, ChordTokens, parse_chord_symbol
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Exceptions
# ---------------------------------------------------------------------------
class ChordFormatError(ValueError):
    """Raised on a structural error in a .chord file.

    Derives from ValueError, so callers catching ValueError also catch it.
    In this module it reports missing or invalid header fields, bars with
    the wrong number of positions, unrecognised chord symbols and malformed
    keys.
    """
# ---------------------------------------------------------------------------
# Data model
# ---------------------------------------------------------------------------
@dataclass
class ChordPeriod:
    """One harmonic period parsed from a .chord file.

    Bundles the header metadata with the grid of bar positions.  The fields
    are declared in the order the positional constructor expects them.
    """
    title: str  # free-text title taken from the 'title' header field
    key: str  # '<tonic>_<mode>', e.g. 'F#_major', 'B_minor'
    time: str  # time signature, e.g. '4/4', '3/4', '6/8'
    subdivision: int  # 4 or 8; with 'time' it fixes the positions per bar
    style: str  # value of the 'style' header field
    function: str  # 'unspecified' when the header field is absent
    bars: list[list[str]]  # bars[bar][pos] = chord symbol | '.' | 'NC' | '?'
# ---------------------------------------------------------------------------
# Note tables shared with transposition logic
# ---------------------------------------------------------------------------
# The twelve pitch classes in sharp spelling; list position = semitones above C.
_CHROMATIC: list[str] = [
    "C", "C#", "D", "D#", "E", "F",
    "F#", "G", "G#", "A", "A#", "B",
]
# Reverse lookup of _CHROMATIC: note name -> semitone index.
_NOTE_INDEX: dict[str, int] = {
    name: semitone for semitone, name in enumerate(_CHROMATIC)
}
# Flat spellings and the _CHROMATIC entry each one is rewritten to.
_FLAT_TO_SHARP: dict[str, str] = {
    "Cb": "B",
    "Db": "C#",
    "Eb": "D#",
    "Fb": "E",
    "Gb": "F#",
    "Ab": "G#",
    "Bb": "A#",
}
# Closed vocabularies accepted for the 'time', 'style' and 'function' headers.
_VALID_TIMES: frozenset[str] = frozenset(("4/4", "3/4", "6/8", "2/4", "12/8"))
_VALID_STYLES: frozenset[str] = frozenset(
    ("user", "jpop", "classical", "jazz", "other")
)
_VALID_FUNCTIONS: frozenset[str] = frozenset((
    "verse", "prechorus", "chorus", "bridge",
    "intro", "outro", "interlude", "other",
))
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _normalize_note(raw: str) -> str:
    """Return *raw* in the sharp spelling used by the note tables.

    Names listed in _FLAT_TO_SHARP are rewritten to their mapped value; any
    other name is kept as is.  The result must be a key of _NOTE_INDEX.

    Raises:
        ValueError: If the (possibly rewritten) name is not a known note.
    """
    canonical = _FLAT_TO_SHARP[raw] if raw in _FLAT_TO_SHARP else raw
    if canonical in _NOTE_INDEX:
        return canonical
    raise ValueError(f"invalid note: {raw!r}")
def _parse_note_from_key(s: str) -> str:
    """Parse the tonic note from a key string fragment (e.g. 'F#', 'Bb').

    Surrounding whitespace is ignored.  The fragment must consist of exactly
    one letter from C, D, E, F, G, A, B, optionally followed by a single '#'
    or 'b'.  Anything longer (for instance 'F##' or 'Cm') is rejected rather
    than truncated, because silently dropping the tail would yield a tonic
    the author did not write and therefore a wrong transposition interval.

    Returns:
        The tonic in the sharp spelling produced by _normalize_note.

    Raises:
        ValueError: If the fragment is empty, does not start with a note
            letter, carries trailing characters, or names a note that
            _normalize_note does not accept.
    """
    s = s.strip()
    if not s or s[0] not in "CDEFGAB":
        raise ValueError(f"invalid note: {s!r}")
    # Only '<letter>' or '<letter><accidental>' are well-formed tonics.
    if len(s) > 2 or (len(s) == 2 and s[1] not in "#b"):
        raise ValueError(f"invalid note: {s!r}")
    return _normalize_note(s)
def _expected_positions(time: str, subdivision: int) -> int:
    """Return how many positions one bar holds.

    *time* is a 'numerator/denominator' string; the result is
    numerator * subdivision divided by denominator, using floor division.
    """
    numerator, denominator = map(int, time.split("/"))
    return numerator * subdivision // denominator
def _tokens_to_symbol(t: ChordTokens) -> str:
    """Rebuild a chord symbol string from its tokens.

    The pieces are concatenated as root, quality, extension and '/bass'.
    An extension of 'none' and a bass of 'root' are placeholders and
    contribute nothing to the output.
    """
    pieces = [t.root, t.quality]
    if t.extension != "none":
        pieces.append(t.extension)
    if t.bass != "root":
        pieces.append(f"/{t.bass}")
    return "".join(pieces)
def _transpose_note(note: str, shift: int) -> str:
    """Return the _CHROMATIC name *shift* semitones above *note*.

    *note* must be a key of _NOTE_INDEX; the sum wraps around modulo 12.
    """
    start = _NOTE_INDEX[note]
    target = (start + shift) % 12
    return _CHROMATIC[target]
def _transpose_symbol(symbol: str, shift: int, fname: str, bar_no: int) -> str:
    """Shift a single bar position by *shift* semitones.

    The structural tokens '.', 'NC' and '?' are returned untouched.  Any
    other token is parsed as a chord symbol; its root, and its bass unless
    that is the 'root' placeholder, are transposed, and the symbol is
    rebuilt from the resulting tokens.

    Raises:
        ChordFormatError: If the symbol cannot be parsed; the message is
            prefixed with *fname* and *bar_no*.
    """
    if symbol in {".", "NC", "?"}:
        return symbol

    try:
        parsed = parse_chord_symbol(symbol)
    except ChordParseError as exc:
        raise ChordFormatError(f"{fname}, bar {bar_no}: {exc}") from exc

    shifted_root = _transpose_note(parsed.root, shift)
    if parsed.bass == "root":
        shifted_bass = "root"
    else:
        shifted_bass = _transpose_note(parsed.bass, shift)

    shifted = ChordTokens(
        shifted_root, parsed.quality, parsed.extension, shifted_bass
    )
    return _tokens_to_symbol(shifted)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def parse_chord_file(path: Path) -> ChordPeriod:
    """Parse a .chord file into a ChordPeriod.

    Everything from '//' to the end of a line is discarded as a comment and
    blank lines are skipped.  Lines starting with '#' are header lines of the
    form '# name: value' (names are lower-cased; '#' lines without a colon
    are ignored).  All remaining lines form the body, whose bars are
    separated by '|' and whose positions are separated by whitespace.

    Args:
        path: Path to the .chord file (UTF-8 encoded).

    Returns:
        ChordPeriod with header metadata and a list of bars.

    Raises:
        ChordFormatError: On missing/invalid header fields (including a key
            whose tonic is not a recognised note), wrong bar position count,
            or unrecognised chord symbols.
    """
    fname = path.name
    text = path.read_text(encoding="utf-8")

    header: dict[str, str] = {}
    body_lines: list[str] = []
    for raw_line in text.splitlines():
        # Strip inline // comments
        line = raw_line.split("//", 1)[0].rstrip()
        if not line.strip():
            continue
        stripped = line.lstrip()
        if stripped.startswith("#"):
            content = stripped[1:].strip()
            if ":" in content:
                k, v = content.split(":", 1)
                header[k.strip().lower()] = v.strip()
        else:
            body_lines.append(line)

    # --- Validate required header fields ---
    for req in ("title", "key", "time", "subdivision", "style"):
        if req not in header:
            raise ChordFormatError(f"{fname}: missing required header field '{req}'")

    raw_time = header["time"]
    if raw_time not in _VALID_TIMES:
        raise ChordFormatError(f"{fname}: invalid time signature '{raw_time}'")

    try:
        subdivision = int(header["subdivision"])
    except ValueError as exc:
        raise ChordFormatError(f"{fname}: subdivision must be an integer") from exc
    if subdivision not in (4, 8):
        raise ChordFormatError(
            f"{fname}: subdivision must be 4 or 8, got {subdivision}"
        )

    style = header["style"]
    if style not in _VALID_STYLES:
        raise ChordFormatError(f"{fname}: invalid style '{style}'")

    # 'function' is optional; an absent or empty value becomes 'unspecified'.
    raw_function = header.get("function", "")
    if raw_function and raw_function not in _VALID_FUNCTIONS:
        raise ChordFormatError(f"{fname}: invalid function '{raw_function}'")
    function = raw_function if raw_function else "unspecified"

    key = header["key"]
    key_parts = key.split("_")
    if len(key_parts) < 2 or key_parts[-1] not in ("major", "minor"):
        raise ChordFormatError(f"{fname}: invalid key format '{key}'")
    # Check the tonic now, while the filename is available for the message,
    # rather than letting a bad key surface only during transposition.
    try:
        _parse_note_from_key(key_parts[0])
    except ValueError as exc:
        raise ChordFormatError(
            f"{fname}: invalid key tonic '{key_parts[0]}'"
        ) from exc

    # --- Parse bars from body ---
    # Join all body lines; split on '|'; non-empty segments are bar contents.
    body_text = " ".join(body_lines)
    bar_contents = [seg.strip() for seg in body_text.split("|") if seg.strip()]
    if not bar_contents:
        raise ChordFormatError(f"{fname}: no bars found in body")

    expected = _expected_positions(raw_time, subdivision)
    bars: list[list[str]] = []
    for bar_no, content in enumerate(bar_contents, start=1):
        positions = content.split()
        if len(positions) != expected:
            raise ChordFormatError(
                f"{fname}, bar {bar_no}: expected {expected} positions,"
                f" got {len(positions)}"
            )
        for pos_no, token in enumerate(positions, start=1):
            # Structural tokens are not chord symbols and need no parsing.
            if token in (".", "NC", "?"):
                continue
            try:
                parse_chord_symbol(token)
            except ChordParseError as exc:
                raise ChordFormatError(
                    f"{fname}, bar {bar_no}, pos {pos_no}: {exc}"
                ) from exc
        bars.append(positions)

    return ChordPeriod(
        title=header["title"],
        key=key,
        time=raw_time,
        subdivision=subdivision,
        style=style,
        function=function,
        bars=bars,
    )
def transpose_to_canonical(period: ChordPeriod) -> ChordPeriod:
    """Move a period into C major (major keys) or A minor (minor keys).

    Args:
        period: A ChordPeriod as returned by parse_chord_file.

    Returns:
        A copy of *period* whose chord roots and bass notes are shifted and
        whose 'key' is 'C_major' or 'A_minor'.  When the required shift is
        zero the very same object is handed back, not a copy.

    Raises:
        ChordFormatError: If the key field is malformed.
    """
    fragments = period.key.split("_")
    mode = fragments[-1]
    if len(fragments) < 2 or mode not in ("major", "minor"):
        raise ChordFormatError(f"invalid key: {period.key!r}")

    try:
        tonic = _parse_note_from_key(fragments[0])
    except ValueError as exc:
        raise ChordFormatError(f"invalid key tonic: {fragments[0]!r}") from exc

    is_major = mode == "major"
    target_index = 0 if is_major else 9  # C = 0, A = 9
    shift = (target_index - _NOTE_INDEX[tonic]) % 12
    if not shift:
        return period  # already canonical

    # No file is involved here, so error messages use a placeholder name.
    fname = "<transposition>"
    new_bars: list[list[str]] = []
    for bar_no, bar in enumerate(period.bars, start=1):
        new_bars.append(
            [_transpose_symbol(sym, shift, fname, bar_no) for sym in bar]
        )

    return replace(
        period,
        key="C_major" if is_major else "A_minor",
        bars=new_bars,
    )
+7
View File
@@ -0,0 +1,7 @@
# title: Wrong position count
# key: C_major
# time: 4/4
# subdivision: 4
# style: user
| C . . . . | G . . . |
+7
View File
@@ -0,0 +1,7 @@
# title: Invalid chord symbol
# key: C_major
# time: 4/4
# subdivision: 4
# style: user
| C . . . | Xyz . . . |
+7
View File
@@ -0,0 +1,7 @@
# title: B minor test
# key: B_minor
# time: 4/4
# subdivision: 4
# style: user
| Bm . . . | C#m7b5 . . . | D . . . | F#7 . . . |
+9
View File
@@ -0,0 +1,9 @@
# title: C major test
# key: C_major
# time: 4/4
# subdivision: 4
# style: user
# function: chorus
| C . . . | Am7 . . . | F/A . . . | G7 . . . | // first half
| Em7 . . . | Am7 . . . | Dm7 . . . | G7 . . . |
+7
View File
@@ -0,0 +1,7 @@
# title: F# major test
# key: F#_major
# time: 4/4
# subdivision: 4
# style: user
| F#maj7 . . . | D#m7 . . . | Bmaj7 . F#/A# . | C# . . . |
+7
View File
@@ -0,0 +1,7 @@
# title: G# minor test
# key: G#_minor
# time: 4/4
# subdivision: 4
# style: user
| G#m . . . | A#maj7 . . . | Bmaj7/F# . . . | D#7 . . . |
+272
View File
@@ -0,0 +1,272 @@
"""Tests for parse_chord_file() and transpose_to_canonical() in src/tokenizer.py.
Fixture files live in tests/fixtures/:
valid_c_major.chord — already canonical (C major, 8 bars, has a // comment)
valid_fsharp_major.chord — F# major with a slash chord (F#/A#)
valid_b_minor.chord — B minor
valid_gsharp_minor.chord — G# minor with a slash chord (Bmaj7/F#)
invalid_bar_count.chord — bar 1 has 5 positions instead of 4
invalid_chord_symbol.chord — bar 2 contains the invalid symbol 'Xyz'
"""
from pathlib import Path
import pytest
from src.chord_parser import ChordTokens, parse_chord_symbol
from src.tokenizer import ChordFormatError, ChordPeriod, parse_chord_file, transpose_to_canonical
FIXTURES = Path(__file__).parent / "fixtures"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def fixture(name: str) -> Path:
    """Return the path of the fixture file *name* inside FIXTURES."""
    return FIXTURES.joinpath(name)
# ---------------------------------------------------------------------------
# parse_chord_file — valid files
# ---------------------------------------------------------------------------
class TestParseChordFile:
    """Parsing of the four valid fixture files."""

    def test_c_major_header_fields(self):
        period = parse_chord_file(fixture("valid_c_major.chord"))
        observed = (
            period.title,
            period.key,
            period.time,
            period.subdivision,
            period.style,
            period.function,
        )
        assert observed == ("C major test", "C_major", "4/4", 4, "user", "chorus")

    def test_c_major_bar_count(self):
        bars = parse_chord_file(fixture("valid_c_major.chord")).bars
        assert len(bars) == 8

    def test_c_major_bar_positions(self):
        bars = parse_chord_file(fixture("valid_c_major.chord")).bars
        held = [".", ".", "."]
        assert bars[0] == ["C", *held]
        assert bars[2] == ["F/A", *held]  # slash chord preserved verbatim
        assert bars[7] == ["G7", *held]

    def test_c_major_comments_stripped(self):
        # The first bar line carries a trailing '// first half' comment;
        # none of it may end up inside a position.
        bars = parse_chord_file(fixture("valid_c_major.chord")).bars
        assert all("//" not in pos for bar in bars for pos in bar)

    def test_each_bar_has_expected_position_count(self):
        bars = parse_chord_file(fixture("valid_c_major.chord")).bars
        # 4/4 with subdivision=4 gives four positions per bar.
        assert all(len(bar) == 4 for bar in bars)

    def test_fsharp_major_parses(self):
        period = parse_chord_file(fixture("valid_fsharp_major.chord"))
        assert period.key == "F#_major"
        assert len(period.bars) == 4
        assert period.bars[0][0] == "F#maj7"
        assert period.bars[2][2] == "F#/A#"  # slash chord preserved verbatim

    def test_b_minor_parses(self):
        period = parse_chord_file(fixture("valid_b_minor.chord"))
        assert period.key == "B_minor"
        # The fixture has no function header field.
        assert period.function == "unspecified"
        first_chords = [bar[0] for bar in period.bars[:2]]
        assert first_chords == ["Bm", "C#m7b5"]

    def test_gsharp_minor_parses(self):
        period = parse_chord_file(fixture("valid_gsharp_minor.chord"))
        assert period.key == "G#_minor"
        assert len(period.bars) == 4
        assert period.bars[2][0] == "Bmaj7/F#"  # slash chord

    def test_hold_positions_preserved(self):
        bars = parse_chord_file(fixture("valid_c_major.chord")).bars
        # Positions 1-3 of every bar are hold markers.
        for bar in bars:
            assert bar[1:4] == [".", ".", "."]
# ---------------------------------------------------------------------------
# parse_chord_file — error cases
# ---------------------------------------------------------------------------
class TestParseChordFileErrors:
    """Error reporting for the two invalid fixture files."""

    @staticmethod
    def _error_text(name):
        """Parse fixture *name*, require a ChordFormatError, return its text."""
        with pytest.raises(ChordFormatError) as caught:
            parse_chord_file(fixture(name))
        return str(caught.value)

    def test_invalid_bar_count_raises(self):
        with pytest.raises(ChordFormatError):
            parse_chord_file(fixture("invalid_bar_count.chord"))

    def test_invalid_bar_count_error_mentions_bar_number(self):
        assert "bar 1" in self._error_text("invalid_bar_count.chord")

    def test_invalid_bar_count_error_mentions_filename(self):
        message = self._error_text("invalid_bar_count.chord")
        assert "invalid_bar_count.chord" in message

    def test_invalid_chord_symbol_raises(self):
        with pytest.raises(ChordFormatError):
            parse_chord_file(fixture("invalid_chord_symbol.chord"))

    def test_invalid_chord_symbol_error_mentions_bar_number(self):
        assert "bar 2" in self._error_text("invalid_chord_symbol.chord")

    def test_invalid_chord_symbol_error_mentions_filename(self):
        message = self._error_text("invalid_chord_symbol.chord")
        assert "invalid_chord_symbol.chord" in message
# ---------------------------------------------------------------------------
# transpose_to_canonical — F# major → C major (shift = 6)
# ---------------------------------------------------------------------------
class TestTransposeFsharpMajor:
    """F# major fixture transposed to C major (shift of 6 semitones)."""

    def setup_method(self):
        self._period = parse_chord_file(fixture("valid_fsharp_major.chord"))
        self._t = transpose_to_canonical(self._period)

    def _tokens_at(self, bar, pos):
        """Parse the transposed symbol at bars[bar][pos] into tokens."""
        return parse_chord_symbol(self._t.bars[bar][pos])

    def test_key_updated_to_c_major(self):
        assert self._t.key == "C_major"

    def test_tonic_chord_becomes_c(self):
        # Bar 0: F#maj7 -> Cmaj7
        assert self._t.bars[0][0] == "Cmaj7"

    def test_second_degree_chord(self):
        # Bar 1: D#m7 -> Am7 (D#=3, 3+6=9=A)
        assert self._t.bars[1][0] == "Am7"

    def test_fourth_degree_chord(self):
        # Bar 2, pos 0: Bmaj7 -> Fmaj7 (B=11, 11+6=17 -> 5=F)
        assert self._t.bars[2][0] == "Fmaj7"

    def test_fifth_degree_chord(self):
        # Bar 3: C# -> G (C#=1, 1+6=7=G)
        chord = self._tokens_at(3, 0)
        assert chord.root == "G"
        assert chord.quality == "maj"

    def test_slash_chord_root_transposed(self):
        # Bar 2, pos 2: F#/A# has root F# -> C
        assert self._tokens_at(2, 2).root == "C"

    def test_slash_chord_bass_transposed(self):
        # Bar 2, pos 2: F#/A# has bass A# (=10) -> E (10+6=16 -> 4=E)
        assert self._tokens_at(2, 2).bass == "E"

    def test_slash_chord_full_tokens(self):
        assert self._tokens_at(2, 2) == ChordTokens("C", "maj", "none", "E")

    def test_hold_positions_unchanged(self):
        # Bars 0, 1 and 3 hold a single chord, so positions 1-3 must stay '.'.
        # Bar 2 has a second chord at position 2 (F#/A# -> Cmaj/E), which is
        # covered by the slash-chord tests.
        assert all(
            pos == "."
            for bar_idx in (0, 1, 3)
            for pos in self._t.bars[bar_idx][1:]
        )

    def test_bar_count_preserved(self):
        before, after = self._period.bars, self._t.bars
        assert len(after) == len(before)
# ---------------------------------------------------------------------------
# transpose_to_canonical — G# minor → A minor (shift = 1)
# ---------------------------------------------------------------------------
class TestTransposeGsharpMinor:
    """G# minor fixture transposed to A minor (shift of 1 semitone)."""

    def setup_method(self):
        self._period = parse_chord_file(fixture("valid_gsharp_minor.chord"))
        self._t = transpose_to_canonical(self._period)

    def _slash_tokens(self):
        """Tokens of the transposed slash chord (originally Bmaj7/F#, bar 2)."""
        return parse_chord_symbol(self._t.bars[2][0])

    def test_key_updated_to_a_minor(self):
        assert self._t.key == "A_minor"

    def test_tonic_becomes_a(self):
        # Bar 0: G#m -> Am (G#=8, 8+1=9=A)
        assert self._t.bars[0][0] == "Am"

    def test_second_degree_chord(self):
        # Bar 1: A#maj7 -> Bmaj7 (A#=10, 10+1=11=B)
        assert self._t.bars[1][0] == "Bmaj7"

    def test_slash_chord_root_transposed(self):
        # Root B (=11) -> C (11+1=12 -> 0=C)
        assert self._slash_tokens().root == "C"

    def test_slash_chord_bass_transposed(self):
        # Bass F# (=6) -> G (6+1=7=G)
        assert self._slash_tokens().bass == "G"

    def test_slash_chord_full_tokens(self):
        assert self._slash_tokens() == ChordTokens("C", "maj7", "none", "G")

    def test_fourth_bar(self):
        # Bar 3: D#7 -> E7 (D#=3, 3+1=4=E)
        assert self._t.bars[3][0] == "E7"
# ---------------------------------------------------------------------------
# transpose_to_canonical — already canonical (C major)
# ---------------------------------------------------------------------------
class TestTransposeCMajorIdentity:
    """A period that is already in C major needs no shift."""

    def test_returns_same_object(self):
        # With a zero shift the input object itself comes back, not a copy.
        original = parse_chord_file(fixture("valid_c_major.chord"))
        assert transpose_to_canonical(original) is original

    def test_key_unchanged(self):
        original = parse_chord_file(fixture("valid_c_major.chord"))
        result = transpose_to_canonical(original)
        assert result.key == "C_major"
# ---------------------------------------------------------------------------
# transpose_to_canonical — B minor → A minor (shift = 10)
# ---------------------------------------------------------------------------
class TestTransposeBMinor:
    """B minor fixture transposed to A minor (shift of 10 semitones)."""

    def setup_method(self):
        self._period = parse_chord_file(fixture("valid_b_minor.chord"))
        self._t = transpose_to_canonical(self._period)

    def _first_of_bar(self, bar):
        """Return the transposed symbol at position 0 of *bar*."""
        return self._t.bars[bar][0]

    def test_key_updated_to_a_minor(self):
        assert self._t.key == "A_minor"

    def test_tonic_becomes_a(self):
        # Bm: B=11, 11+10=21 -> 9=A
        assert self._first_of_bar(0) == "Am"

    def test_half_diminished_chord(self):
        # C#m7b5: C#=1, 1+10=11=B -> Bm7b5
        assert self._first_of_bar(1) == "Bm7b5"

    def test_major_chord_transposed(self):
        # D: D=2, 2+10=12 -> 0=C, a major chord on C
        chord = parse_chord_symbol(self._first_of_bar(2))
        assert chord.root == "C"
        assert chord.quality == "maj"

    def test_dominant_seventh_transposed(self):
        # F#7: F#=6, 6+10=16 -> 4=E -> E7
        assert self._first_of_bar(3) == "E7"