diff --git a/requirements.txt b/requirements.txt index ed8d17a..0f72d83 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,17 @@ -torch -music21 -pretty_midi -pytest -matplotlib -numpy -pandas +# Python >= 3.11 required +# Tested on Python 3.12.10 + +# Core ML +torch==2.12.0 +numpy==2.4.6 +pandas==3.0.3 + +# Music processing +music21==10.1.0 +pretty_midi==0.2.11 + +# Visualization +matplotlib==3.10.9 + +# Testing +pytest==9.0.3 diff --git a/src/tokenizer.py b/src/tokenizer.py new file mode 100644 index 0000000..50ddd87 --- /dev/null +++ b/src/tokenizer.py @@ -0,0 +1,279 @@ +"""Parser and canonical transposer for .chord files. + +Public API (token-ID conversion will be added in the next step): + parse_chord_file(path: Path) -> ChordPeriod + transpose_to_canonical(period: ChordPeriod) -> ChordPeriod + +See docs/chord_format_spec.md for the format specification. +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, replace +from pathlib import Path + +from src.chord_parser import ChordParseError, ChordTokens, parse_chord_symbol + +log = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Exceptions +# --------------------------------------------------------------------------- + + +class ChordFormatError(ValueError): + """Raised on a structural error in a .chord file.""" + + +# --------------------------------------------------------------------------- +# Data model +# --------------------------------------------------------------------------- + + +@dataclass +class ChordPeriod: + """One harmonic period parsed from a .chord file.""" + + title: str + key: str # e.g. 'F#_major', 'B_minor' + time: str # e.g. '4/4', '3/4', '6/8' + subdivision: int # 4 or 8 + style: str + function: str # 'unspecified' when the header field is absent + bars: list[list[str]] # bars[bar][pos] = chord symbol | '.' | 'NC' | '?' 
+ + +# --------------------------------------------------------------------------- +# Note tables shared with transposition logic +# --------------------------------------------------------------------------- + +_CHROMATIC: list[str] = [ + "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B" +] +_NOTE_INDEX: dict[str, int] = {n: i for i, n in enumerate(_CHROMATIC)} + +_FLAT_TO_SHARP: dict[str, str] = { + "Cb": "B", "Db": "C#", "Eb": "D#", "Fb": "E", + "Gb": "F#", "Ab": "G#", "Bb": "A#", +} + +_VALID_TIMES: frozenset[str] = frozenset({"4/4", "3/4", "6/8", "2/4", "12/8"}) +_VALID_STYLES: frozenset[str] = frozenset( + {"user", "jpop", "classical", "jazz", "other"} +) +_VALID_FUNCTIONS: frozenset[str] = frozenset({ + "verse", "prechorus", "chorus", "bridge", + "intro", "outro", "interlude", "other", +}) + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _normalize_note(raw: str) -> str: + note = _FLAT_TO_SHARP.get(raw, raw) + if note not in _NOTE_INDEX: + raise ValueError(f"invalid note: {raw!r}") + return note + + +def _parse_note_from_key(s: str) -> str: + """Parse the tonic note from a key string fragment (e.g. 
'F#', 'Bb').""" + s = s.strip() + if not s or s[0] not in "CDEFGAB": + raise ValueError(f"invalid note: {s!r}") + if len(s) >= 2 and s[1] in "#b": + return _normalize_note(s[:2]) + return _normalize_note(s[0]) + + +def _expected_positions(time: str, subdivision: int) -> int: + """Number of positions per bar for the given time signature and subdivision.""" + num, denom = (int(x) for x in time.split("/")) + return (num * subdivision) // denom + + +def _tokens_to_symbol(t: ChordTokens) -> str: + """Reconstruct a canonical, parseable chord symbol string from ChordTokens.""" + quality_ext = t.quality + ("" if t.extension == "none" else t.extension) + bass_part = "" if t.bass == "root" else f"/{t.bass}" + return t.root + quality_ext + bass_part + + +def _transpose_note(note: str, shift: int) -> str: + return _CHROMATIC[(_NOTE_INDEX[note] + shift) % 12] + + +def _transpose_symbol(symbol: str, shift: int, fname: str, bar_no: int) -> str: + """Transpose one position token by *shift* semitones. + + Structural tokens ('.', 'NC', '?') pass through unchanged. + """ + if symbol in (".", "NC", "?"): + return symbol + try: + t = parse_chord_symbol(symbol) + except ChordParseError as exc: + raise ChordFormatError(f"{fname}, bar {bar_no}: {exc}") from exc + new_root = _transpose_note(t.root, shift) + new_bass = "root" if t.bass == "root" else _transpose_note(t.bass, shift) + return _tokens_to_symbol(ChordTokens(new_root, t.quality, t.extension, new_bass)) + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def parse_chord_file(path: Path) -> ChordPeriod: + """Parse a .chord file into a ChordPeriod. + + Args: + path: Path to the .chord file (UTF-8 encoded). + + Returns: + ChordPeriod with header metadata and a list of bars. + + Raises: + ChordFormatError: On missing/invalid header fields, wrong bar + position count, or unrecognised chord symbols. 
def _split_header_and_body(text: str) -> tuple[dict[str, str], list[str]]:
    """Separate '# name: value' header fields from the bar lines of *text*."""
    header: dict[str, str] = {}
    body_lines: list[str] = []
    for raw_line in text.splitlines():
        # Drop an inline '//' comment together with any trailing whitespace.
        line = raw_line.partition("//")[0].rstrip()
        stripped = line.lstrip()
        if not stripped:
            continue
        if not stripped.startswith("#"):
            body_lines.append(line)
            continue
        # '#' lines without a colon carry no field and are ignored.
        name, sep, value = stripped[1:].partition(":")
        if sep:
            header[name.strip().lower()] = value.strip()
    return header, body_lines


def _parse_bars(body_lines: list[str], expected: int, fname: str) -> list[list[str]]:
    """Split the body into bars and validate every position token."""
    # Bars may span or share lines, so join everything before splitting on
    # '|'; empty segments (leading/trailing/double bar lines) are dropped.
    segments = (seg.strip() for seg in " ".join(body_lines).split("|"))
    bar_contents = [seg for seg in segments if seg]
    if not bar_contents:
        raise ChordFormatError(f"{fname}: no bars found in body")

    bars: list[list[str]] = []
    for bar_no, content in enumerate(bar_contents, start=1):
        positions = content.split()
        if len(positions) != expected:
            raise ChordFormatError(
                f"{fname}, bar {bar_no}: expected {expected} positions,"
                f" got {len(positions)}"
            )
        for pos_no, token in enumerate(positions, start=1):
            if token in (".", "NC", "?"):
                continue
            try:
                # Called for validation only; the symbol is stored verbatim.
                parse_chord_symbol(token)
            except ChordParseError as exc:
                raise ChordFormatError(
                    f"{fname}, bar {bar_no}, pos {pos_no}: {exc}"
                ) from exc
        bars.append(positions)
    return bars


def parse_chord_file(path: Path) -> ChordPeriod:
    """Parse a .chord file into a ChordPeriod.

    Args:
        path: Path to the .chord file (UTF-8 encoded, with or without BOM).

    Returns:
        ChordPeriod with header metadata and a list of bars.  ``function``
        is 'unspecified' when the header does not provide it.

    Raises:
        ChordFormatError: On missing/invalid header fields (including a key
            whose tonic is not a valid note), wrong bar position count, or
            unrecognised chord symbols.
    """
    fname = path.name
    # 'utf-8-sig' also decodes plain UTF-8; it additionally strips a leading
    # BOM, which would otherwise hide the first header line from the '#' test.
    text = path.read_text(encoding="utf-8-sig")
    header, body_lines = _split_header_and_body(text)

    # --- Validate required header fields ---
    for req in ("title", "key", "time", "subdivision", "style"):
        if req not in header:
            raise ChordFormatError(f"{fname}: missing required header field '{req}'")

    raw_time = header["time"]
    if raw_time not in _VALID_TIMES:
        raise ChordFormatError(f"{fname}: invalid time signature '{raw_time}'")

    try:
        subdivision = int(header["subdivision"])
    except ValueError as exc:
        raise ChordFormatError(f"{fname}: subdivision must be an integer") from exc
    if subdivision not in (4, 8):
        raise ChordFormatError(
            f"{fname}: subdivision must be 4 or 8, got {subdivision}"
        )

    style = header["style"]
    if style not in _VALID_STYLES:
        raise ChordFormatError(f"{fname}: invalid style '{style}'")

    raw_function = header.get("function", "")
    if raw_function and raw_function not in _VALID_FUNCTIONS:
        raise ChordFormatError(f"{fname}: invalid function '{raw_function}'")
    function = raw_function if raw_function else "unspecified"

    key = header["key"]
    key_parts = key.split("_")
    if len(key_parts) < 2 or key_parts[-1] not in ("major", "minor"):
        raise ChordFormatError(f"{fname}: invalid key format '{key}'")
    # Reject an unusable tonic here, with the file name in the message,
    # rather than letting it surface later during transposition.
    try:
        _parse_note_from_key(key_parts[0])
    except ValueError as exc:
        raise ChordFormatError(
            f"{fname}: invalid key tonic '{key_parts[0]}'"
        ) from exc

    # --- Parse bars from body ---
    expected = _expected_positions(raw_time, subdivision)
    bars = _parse_bars(body_lines, expected, fname)

    return ChordPeriod(
        title=header["title"],
        key=key,
        time=raw_time,
        subdivision=subdivision,
        style=style,
        function=function,
        bars=bars,
    )
def transpose_to_canonical(period: ChordPeriod) -> ChordPeriod:
    """Transpose a period to C major (major) or A minor (minor).

    Args:
        period: A ChordPeriod as returned by parse_chord_file.

    Returns:
        A new ChordPeriod with all chord roots and bass notes transposed
        and the 'key' field updated to 'C_major' or 'A_minor'.
        Returns the original object unchanged when it is already canonical.

    Raises:
        ChordFormatError: If the key field is malformed, or if a bar holds
            a chord symbol that cannot be parsed.
    """
    key = period.key
    parts = key.split("_")
    if len(parts) < 2 or parts[-1] not in ("major", "minor"):
        raise ChordFormatError(f"invalid key: {key!r}")

    try:
        tonic = _parse_note_from_key(parts[0])
    except ValueError as exc:
        raise ChordFormatError(f"invalid key tonic: {parts[0]!r}") from exc

    mode = parts[-1]
    canonical_index = 0 if mode == "major" else 9  # C = 0, A = 9
    # Upward shift, modulo one octave, that lands the tonic on C or A.
    shift = (canonical_index - _NOTE_INDEX[tonic]) % 12

    if shift == 0:
        return period  # already canonical

    # A ChordPeriod does not remember its file name, so identify it by title
    # in error messages instead of emitting a message starting with ", bar N".
    label = period.title or "<untitled period>"
    new_bars: list[list[str]] = [
        [_transpose_symbol(sym, shift, label, bar_no) for sym in bar]
        for bar_no, bar in enumerate(period.bars, start=1)
    ]

    canonical_key = "C_major" if mode == "major" else "A_minor"
    return replace(period, key=canonical_key, bars=new_bars)
| diff --git a/tests/fixtures/valid_c_major.chord b/tests/fixtures/valid_c_major.chord new file mode 100644 index 0000000..cc5b1ac --- /dev/null +++ b/tests/fixtures/valid_c_major.chord @@ -0,0 +1,9 @@ +# title: C major test +# key: C_major +# time: 4/4 +# subdivision: 4 +# style: user +# function: chorus + +| C . . . | Am7 . . . | F/A . . . | G7 . . . | // first half +| Em7 . . . | Am7 . . . | Dm7 . . . | G7 . . . | diff --git a/tests/fixtures/valid_fsharp_major.chord b/tests/fixtures/valid_fsharp_major.chord new file mode 100644 index 0000000..1de4772 --- /dev/null +++ b/tests/fixtures/valid_fsharp_major.chord @@ -0,0 +1,7 @@ +# title: F# major test +# key: F#_major +# time: 4/4 +# subdivision: 4 +# style: user + +| F#maj7 . . . | D#m7 . . . | Bmaj7 . F#/A# . | C# . . . | diff --git a/tests/fixtures/valid_gsharp_minor.chord b/tests/fixtures/valid_gsharp_minor.chord new file mode 100644 index 0000000..443474d --- /dev/null +++ b/tests/fixtures/valid_gsharp_minor.chord @@ -0,0 +1,7 @@ +# title: G# minor test +# key: G#_minor +# time: 4/4 +# subdivision: 4 +# style: user + +| G#m . . . | A#maj7 . . . | Bmaj7/F# . . . | D#7 . . . | diff --git a/tests/test_chord_file_parser.py b/tests/test_chord_file_parser.py new file mode 100644 index 0000000..a7158d9 --- /dev/null +++ b/tests/test_chord_file_parser.py @@ -0,0 +1,272 @@ +"""Tests for parse_chord_file() and transpose_to_canonical() in src/tokenizer.py. 
from pathlib import Path

import pytest

from src.chord_parser import ChordTokens, parse_chord_symbol
from src.tokenizer import ChordFormatError, ChordPeriod, parse_chord_file, transpose_to_canonical

FIXTURES = Path(__file__).parent / "fixtures"


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def fixture(name: str) -> Path:
    """Return the path of the fixture file called *name*."""
    return FIXTURES.joinpath(name)


# ---------------------------------------------------------------------------
# parse_chord_file — valid files
# ---------------------------------------------------------------------------


class TestParseChordFile:
    """parse_chord_file() applied to the well-formed fixture files."""

    @staticmethod
    def _load(name: str) -> ChordPeriod:
        return parse_chord_file(fixture(name))

    def test_c_major_header_fields(self):
        period = self._load("valid_c_major.chord")
        assert period.title == "C major test"
        assert period.key == "C_major"
        assert period.time == "4/4"
        assert period.subdivision == 4
        assert period.style == "user"
        assert period.function == "chorus"

    def test_c_major_bar_count(self):
        assert len(self._load("valid_c_major.chord").bars) == 8

    def test_c_major_bar_positions(self):
        bars = self._load("valid_c_major.chord").bars
        hold = [".", ".", "."]
        assert bars[0] == ["C", *hold]
        assert bars[2] == ["F/A", *hold]  # slash chord preserved verbatim
        assert bars[7] == ["G7", *hold]

    def test_c_major_comments_stripped(self):
        # The first bar line carries a trailing '// first half' comment;
        # none of it may leak into a position token.
        bars = self._load("valid_c_major.chord").bars
        assert all("//" not in pos for bar in bars for pos in bar)

    def test_each_bar_has_expected_position_count(self):
        # 4/4 at subdivision 4 gives four positions per bar.
        bars = self._load("valid_c_major.chord").bars
        assert all(len(bar) == 4 for bar in bars)

    def test_fsharp_major_parses(self):
        period = self._load("valid_fsharp_major.chord")
        assert period.key == "F#_major"
        assert len(period.bars) == 4
        assert period.bars[0][0] == "F#maj7"
        assert period.bars[2][2] == "F#/A#"  # slash chord preserved verbatim

    def test_b_minor_parses(self):
        period = self._load("valid_b_minor.chord")
        assert period.key == "B_minor"
        assert period.function == "unspecified"  # fixture has no function field
        assert period.bars[0][0] == "Bm"
        assert period.bars[1][0] == "C#m7b5"

    def test_gsharp_minor_parses(self):
        period = self._load("valid_gsharp_minor.chord")
        assert period.key == "G#_minor"
        assert len(period.bars) == 4
        assert period.bars[2][0] == "Bmaj7/F#"  # slash chord

    def test_hold_positions_preserved(self):
        # Each bar of this fixture has one chord followed by three holds.
        bars = self._load("valid_c_major.chord").bars
        assert all(bar[1:4] == [".", ".", "."] for bar in bars)
# ---------------------------------------------------------------------------
# parse_chord_file — error cases
# ---------------------------------------------------------------------------


class TestParseChordFileErrors:
    """Malformed input must raise ChordFormatError with a useful message.

    Body errors are checked against the fixture files; header errors are
    checked against small files written into pytest's ``tmp_path``.
    """

    # Minimal header that parse_chord_file accepts; tests override or drop
    # single entries to hit one validation branch at a time.
    _HEADER = {
        "title": "Header error case",
        "key": "C_major",
        "time": "4/4",
        "subdivision": "4",
        "style": "user",
    }

    @staticmethod
    def _write_chord_file(directory, header, body="| C . . . |"):
        """Write a .chord file built from *header* and *body*; return its path."""
        lines = [f"# {name}: {value}" for name, value in header.items()]
        path = directory / "case.chord"
        path.write_text("\n".join([*lines, "", body, ""]), encoding="utf-8")
        return path

    # --- body errors (fixture files) ---

    def test_invalid_bar_count_raises(self):
        with pytest.raises(ChordFormatError):
            parse_chord_file(fixture("invalid_bar_count.chord"))

    def test_invalid_bar_count_error_mentions_bar_number(self):
        with pytest.raises(ChordFormatError) as exc_info:
            parse_chord_file(fixture("invalid_bar_count.chord"))
        assert "bar 1" in str(exc_info.value)

    def test_invalid_bar_count_error_mentions_filename(self):
        with pytest.raises(ChordFormatError) as exc_info:
            parse_chord_file(fixture("invalid_bar_count.chord"))
        assert "invalid_bar_count.chord" in str(exc_info.value)

    def test_invalid_chord_symbol_raises(self):
        with pytest.raises(ChordFormatError):
            parse_chord_file(fixture("invalid_chord_symbol.chord"))

    def test_invalid_chord_symbol_error_mentions_bar_number(self):
        with pytest.raises(ChordFormatError) as exc_info:
            parse_chord_file(fixture("invalid_chord_symbol.chord"))
        assert "bar 2" in str(exc_info.value)

    def test_invalid_chord_symbol_error_mentions_filename(self):
        with pytest.raises(ChordFormatError) as exc_info:
            parse_chord_file(fixture("invalid_chord_symbol.chord"))
        assert "invalid_chord_symbol.chord" in str(exc_info.value)

    # --- header errors (generated files) ---

    @pytest.mark.parametrize(
        "missing", ["title", "key", "time", "subdivision", "style"]
    )
    def test_missing_required_header_field_raises(self, tmp_path, missing):
        header = {k: v for k, v in self._HEADER.items() if k != missing}
        with pytest.raises(ChordFormatError) as exc_info:
            parse_chord_file(self._write_chord_file(tmp_path, header))
        assert f"'{missing}'" in str(exc_info.value)

    @pytest.mark.parametrize(
        ("field", "value"),
        [
            ("time", "5/4"),
            ("subdivision", "four"),
            ("subdivision", "16"),
            ("style", "rock"),
            ("function", "solo"),
            ("key", "Cmajor"),
        ],
    )
    def test_invalid_header_value_raises(self, tmp_path, field, value):
        header = {**self._HEADER, field: value}
        with pytest.raises(ChordFormatError) as exc_info:
            parse_chord_file(self._write_chord_file(tmp_path, header))
        assert "case.chord" in str(exc_info.value)

    def test_empty_body_raises(self, tmp_path):
        path = self._write_chord_file(tmp_path, self._HEADER, body="")
        with pytest.raises(ChordFormatError) as exc_info:
            parse_chord_file(path)
        assert "no bars" in str(exc_info.value)
# ---------------------------------------------------------------------------
# transpose_to_canonical — F# major → C major (shift = 6)
# ---------------------------------------------------------------------------


class TestTransposeFsharpMajor:
    """F# major fixture transposed up six semitones to C major."""

    def setup_method(self):
        self._period = parse_chord_file(fixture("valid_fsharp_major.chord"))
        self._t = transpose_to_canonical(self._period)

    def _tokens(self, bar: int, pos: int) -> ChordTokens:
        """Parse the transposed symbol at *bar*/*pos* back into tokens."""
        return parse_chord_symbol(self._t.bars[bar][pos])

    def test_key_updated_to_c_major(self):
        assert self._t.key == "C_major"

    def test_tonic_chord_becomes_c(self):
        # bar 0: F#maj7 -> Cmaj7
        first_bar = self._t.bars[0]
        assert first_bar[0] == "Cmaj7"

    def test_second_degree_chord(self):
        # bar 1: D#m7 -> Am7   (D# = 3, 3 + 6 = 9 = A)
        second_bar = self._t.bars[1]
        assert second_bar[0] == "Am7"

    def test_fourth_degree_chord(self):
        # bar 2, pos 0: Bmaj7 -> Fmaj7   (B = 11, 11 + 6 = 17 -> 5 = F)
        third_bar = self._t.bars[2]
        assert third_bar[0] == "Fmaj7"

    def test_fifth_degree_chord(self):
        # bar 3: C# -> G   (C# = 1, 1 + 6 = 7 = G)
        tokens = self._tokens(3, 0)
        assert tokens.root == "G"
        assert tokens.quality == "maj"

    def test_slash_chord_root_transposed(self):
        # bar 2, pos 2: F#/A# — root F# -> C
        assert self._tokens(2, 2).root == "C"

    def test_slash_chord_bass_transposed(self):
        # bar 2, pos 2: F#/A# — bass A# (= 10) -> E   (10 + 6 = 16 -> 4)
        assert self._tokens(2, 2).bass == "E"

    def test_slash_chord_full_tokens(self):
        assert self._tokens(2, 2) == ChordTokens("C", "maj", "none", "E")

    def test_hold_positions_unchanged(self):
        # Bars 0, 1 and 3 hold a single chord, so everything after the first
        # position must still be '.'.  Bar 2 has a second chord at position 2
        # and is covered by the slash-chord tests.
        single_chord_bars = [self._t.bars[i] for i in (0, 1, 3)]
        for bar in single_chord_bars:
            assert all(pos == "." for pos in bar[1:])

    def test_bar_count_preserved(self):
        before, after = self._period.bars, self._t.bars
        assert len(after) == len(before)
# ---------------------------------------------------------------------------
# transpose_to_canonical — G# minor → A minor (shift = 1)
# ---------------------------------------------------------------------------


class TestTransposeGsharpMinor:
    """G# minor fixture transposed up one semitone to A minor."""

    def setup_method(self):
        self._period = parse_chord_file(fixture("valid_gsharp_minor.chord"))
        self._t = transpose_to_canonical(self._period)

    def _slash_chord_tokens(self) -> ChordTokens:
        """Tokens of the transposed Bmaj7/F# chord that opens bar 2."""
        return parse_chord_symbol(self._t.bars[2][0])

    def test_key_updated_to_a_minor(self):
        assert self._t.key == "A_minor"

    def test_tonic_becomes_a(self):
        # bar 0: G#m -> Am   (G# = 8, 8 + 1 = 9 = A)
        first_bar = self._t.bars[0]
        assert first_bar[0] == "Am"

    def test_second_degree_chord(self):
        # bar 1: A#maj7 -> Bmaj7   (A# = 10, 10 + 1 = 11 = B)
        second_bar = self._t.bars[1]
        assert second_bar[0] == "Bmaj7"

    def test_slash_chord_root_transposed(self):
        # root B (= 11) -> C   (11 + 1 wraps to 0)
        assert self._slash_chord_tokens().root == "C"

    def test_slash_chord_bass_transposed(self):
        # bass F# (= 6) -> G   (6 + 1 = 7)
        assert self._slash_chord_tokens().bass == "G"

    def test_slash_chord_full_tokens(self):
        expected = ChordTokens("C", "maj7", "none", "G")
        assert self._slash_chord_tokens() == expected

    def test_fourth_bar(self):
        # bar 3: D#7 -> E7   (D# = 3, 3 + 1 = 4 = E)
        last_bar = self._t.bars[3]
        assert last_bar[0] == "E7"
# ---------------------------------------------------------------------------
# transpose_to_canonical — already canonical (C major)
# ---------------------------------------------------------------------------


class TestTransposeCMajorIdentity:
    """A period that is already in C major needs no transposition."""

    def test_returns_same_object(self):
        # With a shift of zero the function hands back its argument itself,
        # not a copy.
        original = parse_chord_file(fixture("valid_c_major.chord"))
        assert transpose_to_canonical(original) is original

    def test_key_unchanged(self):
        original = parse_chord_file(fixture("valid_c_major.chord"))
        result = transpose_to_canonical(original)
        assert result.key == "C_major"


# ---------------------------------------------------------------------------
# transpose_to_canonical — B minor → A minor (shift = 10)
# ---------------------------------------------------------------------------


class TestTransposeBMinor:
    """B minor fixture transposed up ten semitones to A minor."""

    def setup_method(self):
        self._period = parse_chord_file(fixture("valid_b_minor.chord"))
        self._t = transpose_to_canonical(self._period)

    def _first_symbol(self, bar: int) -> str:
        """Return the transposed symbol at position 0 of *bar*."""
        return self._t.bars[bar][0]

    def test_key_updated_to_a_minor(self):
        assert self._t.key == "A_minor"

    def test_tonic_becomes_a(self):
        # Bm: B = 11, 11 + 10 = 21 -> 9 = A
        assert self._first_symbol(0) == "Am"

    def test_half_diminished_chord(self):
        # C#m7b5: C# = 1, 1 + 10 = 11 = B
        assert self._first_symbol(1) == "Bm7b5"

    def test_major_chord_transposed(self):
        # D: D = 2, 2 + 10 = 12 -> 0 = C; compared through tokens because
        # the rendered symbol spells the quality out.
        tokens = parse_chord_symbol(self._first_symbol(2))
        assert tokens.root == "C"
        assert tokens.quality == "maj"

    def test_dominant_seventh_transposed(self):
        # F#7: F# = 6, 6 + 10 = 16 -> 4 = E
        assert self._first_symbol(3) == "E7"