feat: implement .chord file parser and canonical transposer; freeze requirements
src/tokenizer.py:
- parse_chord_file(Path) → ChordPeriod: reads header + bar body, strips //
comments, validates bar position counts and chord symbols, raises
ChordFormatError with filename and bar number on any violation.
- transpose_to_canonical(ChordPeriod) → ChordPeriod: shifts all chord roots
and bass notes by the semitone offset to C major / A minor; fast-path
returns the original object when shift == 0.
tests/test_chord_file_parser.py: 39 tests covering parsing of 4 valid fixtures
(C major, F# major, B minor, G# minor), error messages for 2 invalid
fixtures, and transposition correctness including slash chord root+bass.
tests/fixtures/: 6 .chord fixture files (4 valid, 2 invalid).
requirements.txt: pinned to current latest stable versions
(torch 2.12.0, music21 10.1.0, pretty_midi 0.2.11, matplotlib 3.10.9,
numpy 2.4.6, pandas 3.0.3, pytest 9.0.3); Python >= 3.11 noted.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,279 @@
|
||||
"""Parser and canonical transposer for .chord files.
|
||||
|
||||
Public API (token-ID conversion will be added in the next step):
|
||||
parse_chord_file(path: Path) -> ChordPeriod
|
||||
transpose_to_canonical(period: ChordPeriod) -> ChordPeriod
|
||||
|
||||
See docs/chord_format_spec.md for the format specification.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, replace
|
||||
from pathlib import Path
|
||||
|
||||
from src.chord_parser import ChordParseError, ChordTokens, parse_chord_symbol
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Exceptions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class ChordFormatError(ValueError):
    """Signal that a .chord file (or a period built from one) is malformed.

    Derives from ValueError, so handlers written for ValueError also
    catch it.
    """
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data model
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class ChordPeriod:
    """Header metadata and bar contents of one parsed .chord file.

    The field order below is the positional constructor order.
    """

    # Value of the 'title' header field.
    title: str
    # Key as '<tonic>_<mode>', e.g. 'F#_major' or 'B_minor'.
    key: str
    # Time signature, e.g. '4/4', '3/4', '6/8'.
    time: str
    # Subdivision: 4 or 8.
    subdivision: int
    # Value of the 'style' header field.
    style: str
    # Value of the 'function' header field; 'unspecified' when it is absent.
    function: str
    # bars[bar][pos] is a chord symbol, '.', 'NC' or '?'.
    bars: list[list[str]]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Note tables shared with transposition logic
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# The twelve pitch classes in ascending order from C, sharp spellings only.
_CHROMATIC: list[str] = "C C# D D# E F F# G G# A A# B".split()

# Pitch-class name -> position in _CHROMATIC (semitones above C).
_NOTE_INDEX: dict[str, int] = dict(zip(_CHROMATIC, range(len(_CHROMATIC))))

# Flat spellings mapped to the equivalent name found in _CHROMATIC.
_FLAT_TO_SHARP: dict[str, str] = dict(
    Cb="B",
    Db="C#",
    Eb="D#",
    Fb="E",
    Gb="F#",
    Ab="G#",
    Bb="A#",
)

# Header values accepted by parse_chord_file.
_VALID_TIMES: frozenset[str] = frozenset(("4/4", "3/4", "6/8", "2/4", "12/8"))
_VALID_STYLES: frozenset[str] = frozenset(
    ("user", "jpop", "classical", "jazz", "other")
)
_VALID_FUNCTIONS: frozenset[str] = frozenset(
    (
        "verse",
        "prechorus",
        "chorus",
        "bridge",
        "intro",
        "outro",
        "interlude",
        "other",
    )
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _normalize_note(raw: str) -> str:
    """Return *raw* spelled as one of the names in ``_CHROMATIC``.

    Flat spellings listed in ``_FLAT_TO_SHARP`` are rewritten to their
    sharp/natural equivalent; any other input is looked up as-is.

    Raises:
        ValueError: If the resulting name is not a key of ``_NOTE_INDEX``.
    """
    sharp_name = _FLAT_TO_SHARP[raw] if raw in _FLAT_TO_SHARP else raw
    if sharp_name in _NOTE_INDEX:
        return sharp_name
    raise ValueError(f"invalid note: {raw!r}")
|
||||
|
||||
|
||||
def _parse_note_from_key(s: str) -> str:
    """Parse the tonic note from a key string fragment (e.g. 'F#', 'Bb').

    Args:
        s: The tonic part of a key string. Surrounding whitespace is ignored.

    Returns:
        The tonic as returned by ``_normalize_note`` (flat spellings are
        rewritten to sharps).

    Raises:
        ValueError: If *s* is empty, does not start with a letter A-G, has
            anything other than a single optional '#' or 'b' after that
            letter, or names a note that ``_normalize_note`` rejects.
    """
    s = s.strip()
    if not s or s[0] not in "CDEFGAB":
        raise ValueError(f"invalid note: {s!r}")
    accidental = s[1:]
    if accidental not in ("", "#", "b"):
        # Trailing characters used to be dropped silently, so 'Bbb' was read
        # as Bb and 'Cmaj' as C, producing a wrong transposition instead of
        # an error.
        raise ValueError(f"invalid note: {s!r}")
    return _normalize_note(s[0] + accidental)
|
||||
|
||||
|
||||
def _expected_positions(time: str, subdivision: int) -> int:
    """Return how many positions one bar holds.

    Args:
        time: Time signature written as '<numerator>/<denominator>'.
        subdivision: Note value of one position (e.g. 4 or 8).

    Returns:
        ``numerator * subdivision // denominator`` (floor division).
    """
    numerator, denominator = map(int, time.split("/"))
    scaled = numerator * subdivision
    return scaled // denominator
|
||||
|
||||
|
||||
def _tokens_to_symbol(t: ChordTokens) -> str:
    """Render ChordTokens back into a chord symbol string.

    The result is root + quality, followed by the extension unless it is the
    'none' placeholder, followed by '/<bass>' unless the bass is the 'root'
    placeholder.
    """
    pieces = [t.root, t.quality]
    if t.extension != "none":
        pieces.append(t.extension)
    if t.bass != "root":
        pieces.append(f"/{t.bass}")
    return "".join(pieces)
|
||||
|
||||
|
||||
def _transpose_note(note: str, shift: int) -> str:
    """Shift a note name by *shift* semitones, wrapping around the octave.

    Args:
        note: A name from ``_CHROMATIC`` or a flat spelling known to
            ``_FLAT_TO_SHARP``.
        shift: Number of semitones; the sum is reduced modulo 12.

    Returns:
        The transposed note, spelled as in ``_CHROMATIC``.

    Raises:
        ValueError: If ``_normalize_note`` does not recognise *note*.
    """
    # Look the note up through _normalize_note instead of indexing
    # _NOTE_INDEX directly, so a flat-spelled note (e.g. 'Bb') is transposed
    # rather than escaping as a bare KeyError.
    # NOTE(review): whether parse_chord_symbol can return flat spellings is
    # not visible from this file — confirm against src.chord_parser.
    return _CHROMATIC[(_NOTE_INDEX[_normalize_note(note)] + shift) % 12]
|
||||
|
||||
|
||||
def _transpose_symbol(symbol: str, shift: int, fname: str, bar_no: int) -> str:
    """Return the position token *symbol* moved by *shift* semitones.

    The structural tokens '.', 'NC' and '?' are returned as they are. Any
    other token is parsed with ``parse_chord_symbol``; its root, and its bass
    unless that is the 'root' placeholder, are transposed and the symbol is
    rebuilt with ``_tokens_to_symbol``.

    Args:
        symbol: One position token from a bar.
        shift: Number of semitones to move by.
        fname: File name used as the prefix of the error message.
        bar_no: Bar number reported in the error message.

    Raises:
        ChordFormatError: If ``parse_chord_symbol`` rejects *symbol*.
    """
    structural_tokens = (".", "NC", "?")
    if symbol in structural_tokens:
        return symbol

    try:
        tokens = parse_chord_symbol(symbol)
    except ChordParseError as exc:
        raise ChordFormatError(f"{fname}, bar {bar_no}: {exc}") from exc

    shifted_root = _transpose_note(tokens.root, shift)
    if tokens.bass == "root":
        shifted_bass = "root"
    else:
        shifted_bass = _transpose_note(tokens.bass, shift)

    shifted_tokens = ChordTokens(
        shifted_root, tokens.quality, tokens.extension, shifted_bass
    )
    return _tokens_to_symbol(shifted_tokens)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _split_header_and_body(text: str) -> tuple[dict[str, str], list[str]]:
    """Separate .chord text into header fields and body lines."""
    header: dict[str, str] = {}
    body_lines: list[str] = []

    for raw_line in text.splitlines():
        # Everything from the first '//' to the end of the line is a comment.
        line = raw_line.partition("//")[0].rstrip()
        if not line.strip():
            continue

        stripped = line.lstrip()
        if not stripped.startswith("#"):
            body_lines.append(line)
            continue

        # Header line: '# name: value'. A '#' line without ':' is dropped.
        content = stripped[1:].strip()
        if ":" in content:
            name, value = content.split(":", 1)
            header[name.strip().lower()] = value.strip()

    return header, body_lines


def _parse_bars(body_lines: list[str], expected: int, fname: str) -> list[list[str]]:
    """Split the body into bars and validate every position token."""
    # Bars may span or share lines, so join everything before splitting on
    # '|'. Empty segments (leading/trailing or doubled bar lines) are skipped.
    body_text = " ".join(body_lines)
    bar_contents = [seg.strip() for seg in body_text.split("|") if seg.strip()]

    if not bar_contents:
        raise ChordFormatError(f"{fname}: no bars found in body")

    bars: list[list[str]] = []
    for bar_no, content in enumerate(bar_contents, start=1):
        positions = content.split()
        if len(positions) != expected:
            raise ChordFormatError(
                f"{fname}, bar {bar_no}: expected {expected} positions,"
                f" got {len(positions)}"
            )
        for pos_no, token in enumerate(positions, start=1):
            if token in (".", "NC", "?"):
                continue  # structural tokens are not chord symbols
            try:
                parse_chord_symbol(token)
            except ChordParseError as exc:
                raise ChordFormatError(
                    f"{fname}, bar {bar_no}, pos {pos_no}: {exc}"
                ) from exc
        bars.append(positions)
    return bars


def parse_chord_file(path: Path) -> ChordPeriod:
    """Parse a .chord file into a ChordPeriod.

    Lines whose first non-blank character is '#' are header lines of the form
    '# name: value' (names are lower-cased). Text from '//' to the end of a
    line is ignored. All remaining non-blank lines form the body, which is
    split into bars on '|' and into positions on whitespace.

    Args:
        path: Path to the .chord file (UTF-8 encoded).

    Returns:
        ChordPeriod with header metadata and a list of bars. 'function' is
        'unspecified' when the header field is absent or empty.

    Raises:
        ChordFormatError: On missing/invalid header fields (including a key
            whose tonic is not a recognised note), an empty body, a wrong
            bar position count, or unrecognised chord symbols.
    """
    fname = path.name
    header, body_lines = _split_header_and_body(path.read_text(encoding="utf-8"))

    # --- Validate required header fields ---
    for req in ("title", "key", "time", "subdivision", "style"):
        if req not in header:
            raise ChordFormatError(f"{fname}: missing required header field '{req}'")

    raw_time = header["time"]
    if raw_time not in _VALID_TIMES:
        raise ChordFormatError(f"{fname}: invalid time signature '{raw_time}'")

    try:
        subdivision = int(header["subdivision"])
    except ValueError as exc:
        # Chain explicitly so the traceback shows the int() failure as the
        # cause rather than as an error raised while handling another one.
        raise ChordFormatError(f"{fname}: subdivision must be an integer") from exc
    if subdivision not in (4, 8):
        raise ChordFormatError(
            f"{fname}: subdivision must be 4 or 8, got {subdivision}"
        )

    style = header["style"]
    if style not in _VALID_STYLES:
        raise ChordFormatError(f"{fname}: invalid style '{style}'")

    raw_function = header.get("function", "")
    if raw_function and raw_function not in _VALID_FUNCTIONS:
        raise ChordFormatError(f"{fname}: invalid function '{raw_function}'")
    function = raw_function or "unspecified"

    key = header["key"]
    key_parts = key.split("_")
    if len(key_parts) < 2 or key_parts[-1] not in ("major", "minor"):
        raise ChordFormatError(f"{fname}: invalid key format '{key}'")
    # Reject an unknown tonic here, with the file name attached, instead of
    # letting it surface later from transpose_to_canonical.
    try:
        _parse_note_from_key(key_parts[0])
    except ValueError as exc:
        raise ChordFormatError(
            f"{fname}: invalid key tonic '{key_parts[0]}'"
        ) from exc

    # --- Parse bars from body ---
    expected = _expected_positions(raw_time, subdivision)
    bars = _parse_bars(body_lines, expected, fname)

    return ChordPeriod(
        title=header["title"],
        key=key,
        time=raw_time,
        subdivision=subdivision,
        style=style,
        function=function,
        bars=bars,
    )
|
||||
|
||||
|
||||
def transpose_to_canonical(period: ChordPeriod) -> ChordPeriod:
    """Move a period into C major (major keys) or A minor (minor keys).

    Args:
        period: A ChordPeriod as returned by parse_chord_file.

    Returns:
        A new ChordPeriod whose chord roots and bass notes are transposed and
        whose 'key' is 'C_major' or 'A_minor'. When the required shift is
        zero the very same *period* object is returned, not a copy.

    Raises:
        ChordFormatError: If the key does not end in '_major'/'_minor', if
            its tonic is not a recognised note, or if a chord symbol in the
            bars cannot be parsed.
    """
    pieces = period.key.split("_")
    mode = pieces[-1]
    if len(pieces) < 2 or mode not in ("major", "minor"):
        raise ChordFormatError(f"invalid key: {period.key!r}")

    tonic_text = pieces[0]
    try:
        tonic = _parse_note_from_key(tonic_text)
    except ValueError as exc:
        raise ChordFormatError(f"invalid key tonic: {tonic_text!r}") from exc

    # Target tonic index in _CHROMATIC: C = 0 for major, A = 9 for minor.
    if mode == "major":
        target_index, target_key = 0, "C_major"
    else:
        target_index, target_key = 9, "A_minor"

    semitones = (target_index - _NOTE_INDEX[tonic]) % 12
    if not semitones:
        return period  # already canonical

    # No file name is known here, so error messages carry a placeholder.
    source_label = "<transposition>"
    shifted_bars: list[list[str]] = []
    for bar_no, bar in enumerate(period.bars, start=1):
        shifted_bars.append(
            [_transpose_symbol(sym, semitones, source_label, bar_no) for sym in bar]
        )

    return replace(period, key=target_key, bars=shifted_bars)
|
||||
Reference in New Issue
Block a user