4fd8ece170
- STYLE_user renamed to STYLE_H1K0 in VOCAB (author's personal tag) - Style field now accepts any [A-Za-z][A-Za-z0-9_]* identifier in .chord files - Unknown styles fall back to STYLE_other at tokenization time with a log warning - Test fixtures updated to style: other; drop closed _VALID_STYLES frozenset - Spec bumped to v2.1: documents open style field, fallback behaviour, and §5.7 guide on registering a new style token Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
479 lines
16 KiB
Python
479 lines
16 KiB
Python
"""Parser, transposer, and tokenizer for .chord files.

Public API:
    parse_chord_file(path: Path) -> ChordPeriod
    transpose_to_canonical(period: ChordPeriod) -> ChordPeriod
    tokenize_period(period: ChordPeriod) -> list[int]
    detokenize_to_period(token_ids: list[int]) -> ChordPeriod

Vocabulary constants:
    VOCAB        -- 81-token ordered list; index == token ID
    TOKEN_TO_ID  -- {token_string: id}
    ID_TO_TOKEN  -- alias for VOCAB

See docs/chord_format_spec.md §5.2 for the vocabulary specification.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass, replace
|
|
from pathlib import Path
|
|
|
|
from src.chord_parser import ChordParseError, ChordTokens, parse_chord_symbol
|
|
|
|
# Module-level logger; tokenize_period uses it to warn about unknown styles.
log = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Exceptions
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class ChordFormatError(ValueError):
    """Signal that a .chord file or token sequence is structurally invalid.

    Derives from ValueError, so handlers written for ValueError also catch it.
    """
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data model
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
class ChordPeriod:
    """Header metadata and bar contents of a single harmonic period.

    The field order below is the positional constructor order and must be
    kept stable.
    """

    # Title taken from the file header.
    title: str
    # Tonic and mode joined by an underscore, e.g. 'F#_major', 'B_minor'.
    key: str
    # Time signature string, e.g. '4/4', '3/4', '6/8'.
    time: str
    # Grid resolution used for bar positions: 4 or 8.
    subdivision: int
    # Style identifier from the header.
    style: str
    # Section label; 'unspecified' when the header field is absent.
    function: str
    # bars[bar][pos] is a chord symbol, '.', 'NC' or '?'.
    bars: list[list[str]]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Note tables shared with transposition logic
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# The twelve pitch classes in ascending semitone order, spelled with sharps.
_CHROMATIC: list[str] = "C C# D D# E F F# G G# A A# B".split()

# Pitch-class name -> semitone offset above C.
_NOTE_INDEX: dict[str, int] = dict(zip(_CHROMATIC, range(len(_CHROMATIC))))

# Flat spellings mapped onto the equivalent entry of _CHROMATIC.
_FLAT_TO_SHARP: dict[str, str] = dict(
    Cb="B", Db="C#", Eb="D#", Fb="E",
    Gb="F#", Ab="G#", Bb="A#",
)
|
|
|
|
# Time signatures accepted in the 'time' header field.
_VALID_TIMES: frozenset[str] = frozenset(("4/4", "3/4", "6/8", "2/4", "12/8"))

# Section labels accepted in the optional 'function' header field.
_VALID_FUNCTIONS: frozenset[str] = frozenset(
    "verse prechorus chorus bridge intro outro interlude other".split()
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Token vocabulary (§5.2)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Token vocabulary, grouped by prefix.  A token's position in VOCAB is its
# integer ID, so neither the groups nor their members may be reordered.
VOCAB: list[str] = [
    # Special (4)
    *("<BOS>", "<EOS>", "<PAD>", "<UNK>"),
    # Mode (2)
    *(f"MODE_{name}" for name in ("major", "minor")),
    # Time signature (5)
    *(f"TIME_{name}" for name in ("4/4", "3/4", "6/8", "2/4", "12/8")),
    # Subdivision (2)
    *(f"SUB_{name}" for name in (4, 8)),
    # Style (5)
    *(f"STYLE_{name}" for name in ("H1K0", "jpop", "classical", "jazz", "other")),
    # Function (9)
    *(f"FUNC_{name}" for name in (
        "verse", "prechorus", "chorus", "bridge",
        "intro", "outro", "interlude", "other", "unspecified",
    )),
    # Chord root — 12 pitch classes, sharps only (12)
    *(f"ROOT_{name}" for name in "C C# D D# E F F# G G# A A# B".split()),
    # Chord quality (18)
    *(f"QUAL_{name}" for name in (
        "maj", "m", "dim", "aug", "sus2", "sus4",
        "maj7", "m7", "7", "m7b5", "dim7", "mM7",
        "7sus4", "aug7", "6", "m6", "add9", "m_add9",
    )),
    # Extension (8)
    *(f"EXT_{name}" for name in ("none", "9", "b9", "#9", "11", "#11", "13", "b13")),
    # Bass note — 'root' sentinel + 12 pitch classes (13)
    "BASS_root",
    *(f"BASS_{name}" for name in "C C# D D# E F F# G G# A A# B".split()),
    # Structural (3)
    *("HOLD", "NC", "BAR"),
]

# Token string -> integer ID (its index in VOCAB).
TOKEN_TO_ID: dict[str, int] = dict(zip(VOCAB, range(len(VOCAB))))

# Integer ID -> token string; the very same list object as VOCAB.
ID_TO_TOKEN: list[str] = VOCAB
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Internal helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _normalize_note(raw: str) -> str:
    """Return *raw* as a sharps-only pitch-class name.

    Flat spellings listed in _FLAT_TO_SHARP are converted; any other input
    is looked up as-is.

    Raises:
        ValueError: If the resulting name is not a key of _NOTE_INDEX.
    """
    candidate = _FLAT_TO_SHARP[raw] if raw in _FLAT_TO_SHARP else raw
    if candidate in _NOTE_INDEX:
        return candidate
    raise ValueError(f"invalid note: {raw!r}")
|
|
|
|
|
|
def _parse_note_from_key(s: str) -> str:
    """Parse the tonic note from a key string fragment (e.g. 'F#', 'Bb').

    Surrounding whitespace is ignored.  The remaining text must be exactly
    one note: a letter A-G optionally followed by a single '#' or 'b'.
    Trailing characters are rejected so that a malformed tonic such as
    'Cmaj' or 'F#x' cannot be silently read as 'C' or 'F#'.

    Args:
        s: The tonic part of a key string.

    Returns:
        The tonic as a sharps-only pitch-class name.

    Raises:
        ValueError: If the fragment is empty, does not start with A-G, has
            trailing characters, or names a note _normalize_note rejects.
    """
    note = s.strip()
    if not note or note[0] not in "CDEFGAB":
        raise ValueError(f"invalid note: {note!r}")
    # Anything beyond "letter + optional accidental" is malformed.
    if len(note) > 2 or (len(note) == 2 and note[1] not in "#b"):
        raise ValueError(f"invalid note: {note!r}")
    return _normalize_note(note)
|
|
|
|
|
|
def _expected_positions(time: str, subdivision: int) -> int:
    """Return how many grid positions one bar holds.

    *time* is a 'numerator/denominator' string; the result is
    numerator * subdivision floor-divided by the denominator.
    """
    beats, unit = map(int, time.split("/"))
    slots = beats * subdivision
    return slots // unit
|
|
|
|
|
|
def _tokens_to_symbol(t: ChordTokens) -> str:
    """Build a chord symbol string from the fields of *t*.

    The result is root + quality, followed by the extension unless it is
    'none', followed by '/bass' unless the bass is 'root'.
    """
    pieces = [t.root, t.quality]
    if t.extension != "none":
        pieces.append(t.extension)
    if t.bass != "root":
        pieces.append(f"/{t.bass}")
    return "".join(pieces)
|
|
|
|
|
|
def _transpose_note(note: str, shift: int) -> str:
    """Return the pitch class *shift* semitones above *note*, wrapping at 12.

    *note* must be a key of _NOTE_INDEX (a sharps-only name).
    """
    position = _NOTE_INDEX[note]
    return _CHROMATIC[(position + shift) % 12]
|
|
|
|
|
|
def _transpose_symbol(symbol: str, shift: int, fname: str, bar_no: int) -> str:
    """Shift a single bar position by *shift* semitones.

    '.', 'NC' and '?' are returned untouched.  Any other value is parsed as
    a chord symbol, its root (and explicit bass note, if any) is transposed,
    and the symbol is rebuilt.

    Raises:
        ChordFormatError: If the symbol cannot be parsed; the message is
            prefixed with *fname* and *bar_no*.
    """
    if symbol == "." or symbol == "NC" or symbol == "?":
        return symbol

    try:
        parsed = parse_chord_symbol(symbol)
    except ChordParseError as exc:
        raise ChordFormatError(f"{fname}, bar {bar_no}: {exc}") from exc

    moved_root = _transpose_note(parsed.root, shift)
    if parsed.bass == "root":
        # The 'root' sentinel follows the root; there is nothing to shift.
        moved_bass = "root"
    else:
        moved_bass = _transpose_note(parsed.bass, shift)

    shifted = ChordTokens(moved_root, parsed.quality, parsed.extension, moved_bass)
    return _tokens_to_symbol(shifted)
|
|
|
|
|
|
def _qual_token(quality: str) -> str:
    """Return the QUAL_x vocabulary token name for a quality string.

    'm(add9)' is spelled 'QUAL_m_add9'; every other quality is prefixed
    with 'QUAL_' unchanged.
    """
    if quality == "m(add9)":
        return "QUAL_m_add9"
    return f"QUAL_{quality}"
|
|
|
|
|
|
def _token_qual(token: str) -> str:
    """Return the quality string for a QUAL_x vocabulary token name.

    The first five characters (the 'QUAL_' prefix) are dropped without
    being checked; a remainder of 'm_add9' becomes 'm(add9)'.
    """
    name = token[len("QUAL_"):]
    if name == "m_add9":
        return "m(add9)"
    return name
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public API
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def parse_chord_file(path: Path) -> ChordPeriod:
    """Parse a .chord file into a ChordPeriod.

    Lines whose first non-blank character is '#' are header lines of the
    form ``# name: value`` (names are lower-cased; '#' lines without a ':'
    are ignored).  Everything from the first '//' on a line is discarded as
    a comment.  The remaining non-blank lines form the body, which is split
    on '|' into bars and on whitespace into positions.

    Args:
        path: Path to the .chord file (UTF-8 encoded).

    Returns:
        ChordPeriod with header metadata and a list of bars.  ``function``
        is 'unspecified' when the header omits it or leaves it empty.

    Raises:
        ChordFormatError: On missing/invalid header fields (including a key
            whose tonic is not a valid note), wrong bar position count, or
            unrecognised chord symbols.
    """
    fname = path.name
    text = path.read_text(encoding="utf-8")

    header: dict[str, str] = {}
    body_lines: list[str] = []

    for raw_line in text.splitlines():
        # Strip inline // comments.
        line = raw_line.partition("//")[0].rstrip()
        if not line.strip():
            continue

        stripped = line.lstrip()
        if stripped.startswith("#"):
            name, sep, value = stripped[1:].partition(":")
            if sep:
                header[name.strip().lower()] = value.strip()
        else:
            body_lines.append(line)

    # --- Validate required header fields ---
    for req in ("title", "key", "time", "subdivision", "style"):
        if req not in header:
            raise ChordFormatError(f"{fname}: missing required header field '{req}'")

    raw_time = header["time"]
    if raw_time not in _VALID_TIMES:
        raise ChordFormatError(f"{fname}: invalid time signature '{raw_time}'")

    try:
        subdivision = int(header["subdivision"])
    except ValueError as exc:
        raise ChordFormatError(f"{fname}: subdivision must be an integer") from exc
    if subdivision not in (4, 8):
        raise ChordFormatError(
            f"{fname}: subdivision must be 4 or 8, got {subdivision}"
        )

    # The style field is open: any identifier is accepted here.
    style = header["style"]
    if not re.fullmatch(r"[A-Za-z][A-Za-z0-9_]*", style):
        raise ChordFormatError(
            f"{fname}: invalid style '{style}' — must be a non-empty identifier"
            " ([A-Za-z][A-Za-z0-9_]*)"
        )

    raw_function = header.get("function", "")
    if raw_function and raw_function not in _VALID_FUNCTIONS:
        raise ChordFormatError(f"{fname}: invalid function '{raw_function}'")
    function = raw_function if raw_function else "unspecified"

    key = header["key"]
    key_parts = key.split("_")
    if len(key_parts) < 2 or key_parts[-1] not in ("major", "minor"):
        raise ChordFormatError(f"{fname}: invalid key format '{key}'")
    # Check the tonic now so a bad key is reported with the file name,
    # instead of surfacing later during transposition.
    try:
        _parse_note_from_key(key_parts[0])
    except ValueError as exc:
        raise ChordFormatError(
            f"{fname}: invalid key tonic '{key_parts[0]}'"
        ) from exc

    # --- Parse bars from body ---
    # Join all body lines; split on '|'; non-empty segments are bar contents.
    body_text = " ".join(body_lines)
    bar_contents = [seg.strip() for seg in body_text.split("|") if seg.strip()]

    if not bar_contents:
        raise ChordFormatError(f"{fname}: no bars found in body")

    expected = _expected_positions(raw_time, subdivision)
    bars: list[list[str]] = []

    for bar_no, content in enumerate(bar_contents, start=1):
        positions = content.split()
        if len(positions) != expected:
            raise ChordFormatError(
                f"{fname}, bar {bar_no}: expected {expected} positions,"
                f" got {len(positions)}"
            )
        for pos_no, token in enumerate(positions, start=1):
            if token in (".", "NC", "?"):
                continue
            # Parsed only for validation; the raw symbol text is stored.
            try:
                parse_chord_symbol(token)
            except ChordParseError as exc:
                raise ChordFormatError(
                    f"{fname}, bar {bar_no}, pos {pos_no}: {exc}"
                ) from exc
        bars.append(positions)

    return ChordPeriod(
        title=header["title"],
        key=key,
        time=raw_time,
        subdivision=subdivision,
        style=style,
        function=function,
        bars=bars,
    )
|
|
|
|
|
|
def transpose_to_canonical(period: ChordPeriod) -> ChordPeriod:
    """Transpose a period to C major (major) or A minor (minor).

    Args:
        period: A ChordPeriod as returned by parse_chord_file.

    Returns:
        A new ChordPeriod with all chord roots and bass notes transposed
        and the 'key' field updated to 'C_major' or 'A_minor'.
        Returns the original object unchanged when it is already canonical.
        When the tonic is already C/A but the key string is spelled
        differently, a copy with only the key rewritten is returned (it
        shares the original ``bars`` list).

    Raises:
        ChordFormatError: If the key field is malformed, or a chord symbol
            cannot be parsed while transposing.
    """
    key = period.key
    parts = key.split("_")
    if len(parts) < 2 or parts[-1] not in ("major", "minor"):
        raise ChordFormatError(f"invalid key: {key!r}")

    try:
        tonic = _parse_note_from_key(parts[0])
    except ValueError as exc:
        raise ChordFormatError(f"invalid key tonic: {parts[0]!r}") from exc

    mode = parts[-1]
    if mode == "major":
        canonical_index, canonical_key = 0, "C_major"  # C = 0
    else:
        canonical_index, canonical_key = 9, "A_minor"  # A = 9
    shift = (canonical_index - _NOTE_INDEX[tonic]) % 12

    if shift == 0:
        if key == canonical_key:
            return period  # already canonical
        # No notes need to move, but the key string must still be the
        # canonical spelling because callers compare against it.
        return replace(period, key=canonical_key)

    fname = "<transposition>"
    new_bars: list[list[str]] = [
        [_transpose_symbol(sym, shift, fname, bar_no) for sym in bar]
        for bar_no, bar in enumerate(period.bars, start=1)
    ]

    return replace(period, key=canonical_key, bars=new_bars)
|
|
|
|
|
|
def tokenize_period(period: ChordPeriod) -> list[int]:
    """Transpose a period to canonical key and encode it as a token ID sequence.

    Args:
        period: A ChordPeriod as returned by parse_chord_file.

    Returns:
        List of integer token IDs: <BOS>, metadata tokens (mode, time,
        subdivision, style, function), per-bar chord tokens interleaved
        with HOLD/NC/<UNK>, each bar closed by BAR, then <EOS>.  Each chord
        contributes four tokens: ROOT, QUAL, EXT, BASS.  A style with no
        STYLE_x token is logged and encoded as STYLE_other.

    Raises:
        ChordFormatError: If a chord symbol cannot be parsed, or a metadata
            value or chord component has no token in the vocabulary.
    """
    p = transpose_to_canonical(period)
    # transpose_to_canonical only accepts keys ending in '_major'/'_minor',
    # so the suffix is the mode however the tonic part is spelled.
    mode = p.key.rsplit("_", 1)[-1]

    def _id(token: str) -> int:
        # Report an unencodable value as a format error, not a KeyError.
        try:
            return TOKEN_TO_ID[token]
        except KeyError:
            raise ChordFormatError(
                f"no vocabulary token {token!r} for period {p.title!r}"
            ) from None

    ids: list[int] = [
        _id("<BOS>"),
        _id(f"MODE_{mode}"),
        _id(f"TIME_{p.time}"),
        _id(f"SUB_{p.subdivision}"),
    ]

    style_token = f"STYLE_{p.style}"
    if style_token not in TOKEN_TO_ID:
        log.warning("unknown style %r — mapping to STYLE_other", p.style)
        style_token = "STYLE_other"
    ids.append(_id(style_token))
    ids.append(_id(f"FUNC_{p.function}"))

    for bar_no, bar in enumerate(p.bars, start=1):
        for pos in bar:
            if pos == ".":
                ids.append(_id("HOLD"))
            elif pos == "NC":
                ids.append(_id("NC"))
            elif pos == "?":
                ids.append(_id("<UNK>"))
            else:
                # An already-canonical period skips transposition, so this
                # may be the first time the symbol is parsed.
                try:
                    t = parse_chord_symbol(pos)
                except ChordParseError as exc:
                    raise ChordFormatError(f"bar {bar_no}: {exc}") from exc
                ids.append(_id(f"ROOT_{t.root}"))
                ids.append(_id(_qual_token(t.quality)))
                ids.append(_id(f"EXT_{t.extension}"))
                ids.append(_id(f"BASS_{t.bass}"))
        ids.append(_id("BAR"))

    ids.append(_id("<EOS>"))
    return ids
|
|
|
|
|
|
def detokenize_to_period(token_ids: list[int]) -> ChordPeriod:
    """Convert a token ID sequence back to a ChordPeriod in canonical key (C/Am).

    Expected layout: <BOS>, MODE, TIME, SUB, STYLE, FUNC, then bar content
    (HOLD / NC / <UNK> / four-token chord groups ROOT, QUAL, EXT, BASS),
    each bar closed by BAR.  Decoding stops at the first <EOS>; anything
    after it is ignored, and a sequence that ends without <EOS> is accepted
    as long as the last bar was closed.

    Args:
        token_ids: Sequence produced by tokenize_period.

    Returns:
        ChordPeriod with key='C_major' or 'A_minor', title='detokenized'.

    Raises:
        ChordFormatError: If the sequence is structurally malformed: an ID
            outside the vocabulary, a missing <BOS> or metadata token, an
            incomplete or mis-ordered chord group, an unexpected token, or
            an unclosed final bar.
    """
    vocab_size = len(ID_TO_TOKEN)
    tokens: list[str] = []
    for position, token_id in enumerate(token_ids):
        # Check the range explicitly: list indexing would silently map
        # negative IDs to tokens counted from the end of the vocabulary.
        if not 0 <= token_id < vocab_size:
            raise ChordFormatError(
                f"token id {token_id!r} at position {position} is outside"
                f" the vocabulary (0..{vocab_size - 1})"
            )
        tokens.append(ID_TO_TOKEN[token_id])

    n = len(tokens)
    idx = 0

    def _consume(prefix: str) -> str:
        """Take the next token, require *prefix*, and return the remainder."""
        nonlocal idx
        if idx >= n:
            raise ChordFormatError(
                f"unexpected end of token sequence; expected '{prefix}...'"
            )
        tok = tokens[idx]
        if not tok.startswith(prefix):
            raise ChordFormatError(
                f"expected token starting with '{prefix}', got {tok!r} at position {idx}"
            )
        idx += 1
        return tok[len(prefix):]

    if not tokens or tokens[0] != "<BOS>":
        got = repr(tokens[0]) if tokens else "empty sequence"
        raise ChordFormatError(f"token sequence must start with <BOS>, got {got}")
    idx += 1

    mode = _consume("MODE_")
    time_sig = _consume("TIME_")
    subdivision = int(_consume("SUB_"))
    style = _consume("STYLE_")
    function = _consume("FUNC_")

    key = "C_major" if mode == "major" else "A_minor"

    bars: list[list[str]] = []
    current_bar: list[str] = []

    while idx < n:
        tok = tokens[idx]
        idx += 1

        if tok == "<EOS>":
            break
        elif tok == "BAR":
            bars.append(current_bar)
            current_bar = []
        elif tok == "HOLD":
            current_bar.append(".")
        elif tok == "NC":
            current_bar.append("NC")
        elif tok == "<UNK>":
            current_bar.append("?")
        elif tok.startswith("ROOT_"):
            if idx + 3 > n:
                raise ChordFormatError(
                    "incomplete chord token group near end of sequence"
                )
            group = tokens[idx:idx + 3]
            # The three followers are sliced by prefix length below, so
            # their prefixes must be checked first.
            for offset, (member, prefix) in enumerate(
                zip(group, ("QUAL_", "EXT_", "BASS_"))
            ):
                if not member.startswith(prefix):
                    raise ChordFormatError(
                        f"expected token starting with '{prefix}', got"
                        f" {member!r} at position {idx + offset}"
                    )
            idx += 3
            qual_tok, ext_tok, bass_tok = group
            root = tok[5:]            # strip "ROOT_"
            quality = _token_qual(qual_tok)
            extension = ext_tok[4:]   # strip "EXT_"
            bass = bass_tok[5:]       # strip "BASS_"
            current_bar.append(
                _tokens_to_symbol(ChordTokens(root, quality, extension, bass))
            )
        else:
            raise ChordFormatError(f"unexpected token in bar body: {tok!r}")

    if current_bar:
        raise ChordFormatError(
            "token sequence ended without closing BAR before <EOS>"
        )

    return ChordPeriod(
        title="detokenized",
        key=key,
        time=time_sig,
        subdivision=subdivision,
        style=style,
        function=function,
        bars=bars,
    )
|