"""Parser, transposer, and tokenizer for .chord files.

Public API:
    parse_chord_file(path: Path) -> ChordPeriod
    transpose_to_canonical(period: ChordPeriod) -> ChordPeriod
    tokenize_period(period: ChordPeriod) -> list[int]
    detokenize_to_period(token_ids: list[int]) -> ChordPeriod

Vocabulary constants:
    VOCAB       -- 81-token ordered list; index == token ID
    TOKEN_TO_ID -- {token_string: id}
    ID_TO_TOKEN -- alias for VOCAB

See docs/chord_format_spec.md §5.2 for the vocabulary specification.
"""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass, replace
from pathlib import Path

from src.chord_parser import ChordParseError, ChordTokens, parse_chord_symbol

log = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Exceptions
# ---------------------------------------------------------------------------

class ChordFormatError(ValueError):
    """Raised on a structural error in a .chord file.

    The functions in this module also raise it for a malformed key on a
    ChordPeriod and for a malformed token sequence.  Because it derives from
    ValueError, callers may catch either type.
    """


# ---------------------------------------------------------------------------
# Data model
# ---------------------------------------------------------------------------

@dataclass
class ChordPeriod:
    """One harmonic period parsed from a .chord file.

    Field order is part of the constructor interface (positional
    construction), so new fields should only be appended.
    """

    title: str
    key: str  # '<tonic>_<mode>', e.g. 'F#_major', 'B_minor'
    time: str  # time signature, e.g. '4/4', '3/4', '6/8'
    subdivision: int  # 4 or 8
    style: str
    function: str  # 'unspecified' when the header field is absent
    bars: list[list[str]]  # bars[bar][pos] = chord symbol | '.' | 'NC' | '?'
# ---------------------------------------------------------------------------
# Note tables shared with transposition logic
# ---------------------------------------------------------------------------

# The twelve pitch classes, sharps only; list index == semitones above C.
_CHROMATIC: list[str] = [
    "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"
]
_NOTE_INDEX: dict[str, int] = {n: i for i, n in enumerate(_CHROMATIC)}

# Flat spellings folded onto their sharp equivalents.
_FLAT_TO_SHARP: dict[str, str] = {
    "Cb": "B", "Db": "C#", "Eb": "D#", "Fb": "E",
    "Gb": "F#", "Ab": "G#", "Bb": "A#",
}

_VALID_TIMES: frozenset[str] = frozenset({"4/4", "3/4", "6/8", "2/4", "12/8"})

_VALID_FUNCTIONS: frozenset[str] = frozenset({
    "verse", "prechorus", "chorus", "bridge",
    "intro", "outro", "interlude", "other",
})

# Bar positions that are not chord symbols: hold, no-chord, unknown.
_STRUCTURAL_SYMBOLS: tuple[str, ...] = (".", "NC", "?")

# Accepted spelling of the 'style' header field.
_STYLE_RE = re.compile(r"[A-Za-z][A-Za-z0-9_]*")

# ---------------------------------------------------------------------------
# Token vocabulary (§5.2)
# ---------------------------------------------------------------------------

# Special tokens.  Each must have a distinct, non-empty name: TOKEN_TO_ID is
# keyed by name, so duplicate names would collapse onto a single ID and make
# sequence start, sequence end and the '?' position indistinguishable.
# NOTE(review): confirm these names and their order (IDs 0-3) against
# docs/chord_format_spec.md §5.2.
_PAD = "<PAD>"
_BOS = "<BOS>"  # first token of every sequence
_EOS = "<EOS>"  # last token of every sequence
_UNK = "<UNK>"  # encodes a '?' bar position

VOCAB: list[str] = [
    # Special (4)
    _PAD, _BOS, _EOS, _UNK,
    # Mode (2)
    "MODE_major", "MODE_minor",
    # Time signature (5)
    "TIME_4/4", "TIME_3/4", "TIME_6/8", "TIME_2/4", "TIME_12/8",
    # Subdivision (2)
    "SUB_4", "SUB_8",
    # Style (5)
    "STYLE_H1K0", "STYLE_jpop", "STYLE_classical", "STYLE_jazz", "STYLE_other",
    # Function (9)
    "FUNC_verse", "FUNC_prechorus", "FUNC_chorus", "FUNC_bridge",
    "FUNC_intro", "FUNC_outro", "FUNC_interlude", "FUNC_other",
    "FUNC_unspecified",
    # Chord root — 12 pitch classes, sharps only (12)
    "ROOT_C", "ROOT_C#", "ROOT_D", "ROOT_D#", "ROOT_E", "ROOT_F",
    "ROOT_F#", "ROOT_G", "ROOT_G#", "ROOT_A", "ROOT_A#", "ROOT_B",
    # Chord quality (18)
    "QUAL_maj", "QUAL_m", "QUAL_dim", "QUAL_aug", "QUAL_sus2", "QUAL_sus4",
    "QUAL_maj7", "QUAL_m7", "QUAL_7", "QUAL_m7b5", "QUAL_dim7", "QUAL_mM7",
    "QUAL_7sus4", "QUAL_aug7", "QUAL_6", "QUAL_m6", "QUAL_add9", "QUAL_m_add9",
    # Extension (8)
    "EXT_none", "EXT_9", "EXT_b9", "EXT_#9", "EXT_11", "EXT_#11", "EXT_13",
    "EXT_b13",
    # Bass note — 'root' sentinel + 12 pitch classes (13)
    "BASS_root",
    "BASS_C", "BASS_C#", "BASS_D", "BASS_D#", "BASS_E", "BASS_F",
    "BASS_F#", "BASS_G", "BASS_G#", "BASS_A", "BASS_A#", "BASS_B",
    # Structural (3)
    "HOLD", "NC", "BAR",
]

TOKEN_TO_ID: dict[str, int] = {tok: i for i, tok in enumerate(VOCAB)}
ID_TO_TOKEN: list[str] = VOCAB


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _normalize_note(raw: str) -> str:
    """Return the sharps-only spelling of *raw*; raise ValueError if unknown."""
    note = _FLAT_TO_SHARP.get(raw, raw)
    if note not in _NOTE_INDEX:
        raise ValueError(f"invalid note: {raw!r}")
    return note


def _parse_note_from_key(s: str) -> str:
    """Parse the tonic note from a key string fragment (e.g. 'F#', 'Bb').

    Only the leading letter and an optional '#'/'b' are read; any further
    characters are ignored.  Raises ValueError when no valid note is found.
    """
    s = s.strip()
    if not s or s[0] not in "CDEFGAB":
        raise ValueError(f"invalid note: {s!r}")
    if len(s) >= 2 and s[1] in "#b":
        return _normalize_note(s[:2])
    return _normalize_note(s[0])


def _expected_positions(time: str, subdivision: int) -> int:
    """Number of positions per bar for the given time signature and subdivision."""
    num, denom = (int(x) for x in time.split("/"))
    return (num * subdivision) // denom


def _tokens_to_symbol(t: ChordTokens) -> str:
    """Reconstruct a canonical, parseable chord symbol string from ChordTokens."""
    quality_ext = t.quality + ("" if t.extension == "none" else t.extension)
    bass_part = "" if t.bass == "root" else f"/{t.bass}"
    return t.root + quality_ext + bass_part


def _transpose_note(note: str, shift: int) -> str:
    """Shift a sharps-only note name by *shift* semitones, wrapping at the octave."""
    return _CHROMATIC[(_NOTE_INDEX[note] + shift) % 12]


def _transpose_symbol(symbol: str, shift: int, fname: str, bar_no: int) -> str:
    """Transpose one position token by *shift* semitones.

    Structural tokens ('.', 'NC', '?') pass through unchanged.

    Raises:
        ChordFormatError: If *symbol* is not a parseable chord symbol; the
            message is prefixed with *fname* and *bar_no*.
    """
    if symbol in _STRUCTURAL_SYMBOLS:
        return symbol
    try:
        t = parse_chord_symbol(symbol)
    except ChordParseError as exc:
        raise ChordFormatError(f"{fname}, bar {bar_no}: {exc}") from exc
    new_root = _transpose_note(t.root, shift)
    new_bass = "root" if t.bass == "root" else _transpose_note(t.bass, shift)
    return _tokens_to_symbol(ChordTokens(new_root, t.quality, t.extension, new_bass))


def _qual_token(quality: str) -> str:
    """Map canonical quality string → QUAL_x token name."""
    return "QUAL_m_add9" if quality == "m(add9)" else f"QUAL_{quality}"


def _token_qual(token: str) -> str:
    """Map QUAL_x token name → canonical quality string."""
    suffix = token[len("QUAL_"):]
    return "m(add9)" if suffix == "m_add9" else suffix


def _token_id(token: str) -> int:
    """Look up a token's ID, raising ChordFormatError for unknown tokens."""
    try:
        return TOKEN_TO_ID[token]
    except KeyError:
        raise ChordFormatError(
            f"token {token!r} is not in the vocabulary"
        ) from None


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def parse_chord_file(path: Path) -> ChordPeriod:
    """Parse a .chord file into a ChordPeriod.

    Lines whose first non-blank character is '#' are header lines of the form
    '# name: value'; '#' lines without a colon are ignored.  Everything after
    '//' on a line is discarded.  All other non-blank lines form the body,
    where '|' separates bars and whitespace separates positions.

    Args:
        path: Path to the .chord file (UTF-8 encoded).

    Returns:
        ChordPeriod with header metadata and a list of bars.

    Raises:
        ChordFormatError: On missing/invalid header fields, wrong bar
            position count, or unrecognised chord symbols.
    """
    fname = path.name
    text = path.read_text(encoding="utf-8")

    header: dict[str, str] = {}
    body_lines: list[str] = []

    for raw_line in text.splitlines():
        # Strip inline // comments
        line = raw_line.partition("//")[0].rstrip()
        stripped = line.lstrip()
        if not stripped:
            continue
        if stripped.startswith("#"):
            content = stripped[1:].strip()
            if ":" in content:
                k, v = content.split(":", 1)
                header[k.strip().lower()] = v.strip()
        else:
            body_lines.append(line)

    # --- Validate required header fields ---
    for req in ("title", "key", "time", "subdivision", "style"):
        if req not in header:
            raise ChordFormatError(f"{fname}: missing required header field '{req}'")

    raw_time = header["time"]
    if raw_time not in _VALID_TIMES:
        raise ChordFormatError(f"{fname}: invalid time signature '{raw_time}'")

    try:
        subdivision = int(header["subdivision"])
    except ValueError as exc:
        raise ChordFormatError(
            f"{fname}: subdivision must be an integer"
        ) from exc
    if subdivision not in (4, 8):
        raise ChordFormatError(
            f"{fname}: subdivision must be 4 or 8, got {subdivision}"
        )

    style = header["style"]
    if not _STYLE_RE.fullmatch(style):
        raise ChordFormatError(
            f"{fname}: invalid style '{style}' — must be a non-empty identifier"
            " ([A-Za-z][A-Za-z0-9_]*)"
        )

    raw_function = header.get("function", "")
    if raw_function and raw_function not in _VALID_FUNCTIONS:
        raise ChordFormatError(f"{fname}: invalid function '{raw_function}'")
    function = raw_function if raw_function else "unspecified"

    key = header["key"]
    key_parts = key.split("_")
    if len(key_parts) < 2 or key_parts[-1] not in ("major", "minor"):
        raise ChordFormatError(f"{fname}: invalid key format '{key}'")

    # --- Parse bars from body ---
    # Join all body lines; split on '|'; non-empty segments are bar contents.
    body_text = " ".join(body_lines)
    bar_contents = [seg.strip() for seg in body_text.split("|") if seg.strip()]

    if not bar_contents:
        raise ChordFormatError(f"{fname}: no bars found in body")

    expected = _expected_positions(raw_time, subdivision)
    bars: list[list[str]] = []

    for bar_no, content in enumerate(bar_contents, start=1):
        positions = content.split()
        if len(positions) != expected:
            raise ChordFormatError(
                f"{fname}, bar {bar_no}: expected {expected} positions,"
                f" got {len(positions)}"
            )
        for pos_no, token in enumerate(positions, start=1):
            if token in _STRUCTURAL_SYMBOLS:
                continue
            # Parsed only for validation; the raw symbol text is what is stored.
            try:
                parse_chord_symbol(token)
            except ChordParseError as exc:
                raise ChordFormatError(
                    f"{fname}, bar {bar_no}, pos {pos_no}: {exc}"
                ) from exc
        bars.append(positions)

    return ChordPeriod(
        title=header["title"],
        key=key,
        time=raw_time,
        subdivision=subdivision,
        style=style,
        function=function,
        bars=bars,
    )


def transpose_to_canonical(period: ChordPeriod) -> ChordPeriod:
    """Transpose a period to C major (major) or A minor (minor).

    Args:
        period: A ChordPeriod as returned by parse_chord_file.

    Returns:
        A new ChordPeriod with all chord roots and bass notes transposed and
        the 'key' field updated to 'C_major' or 'A_minor'.  Returns the
        original object unchanged when it is already canonical.  When the
        tonic is already C/A but the key string is spelled differently, only
        the key string is normalised.

    Raises:
        ChordFormatError: If the key field is malformed, or a chord symbol
            cannot be parsed while transposing.
    """
    key = period.key
    parts = key.split("_")
    if len(parts) < 2 or parts[-1] not in ("major", "minor"):
        raise ChordFormatError(f"invalid key: {key!r}")

    try:
        tonic = _parse_note_from_key(parts[0])
    except ValueError as exc:
        raise ChordFormatError(f"invalid key tonic: {parts[0]!r}") from exc

    mode = parts[-1]
    canonical_key = "C_major" if mode == "major" else "A_minor"
    canonical_index = _NOTE_INDEX[canonical_key[0]]  # C = 0, A = 9
    shift = (canonical_index - _NOTE_INDEX[tonic]) % 12

    if shift == 0:
        # No chord changes; make sure the key string itself is canonical so
        # downstream code can rely on it.
        if key == canonical_key:
            return period  # already canonical
        return replace(period, key=canonical_key)

    # A ChordPeriod carries no file name, so use a fixed label in messages.
    fname = "<period>"
    new_bars: list[list[str]] = [
        [_transpose_symbol(sym, shift, fname, bar_no) for sym in bar]
        for bar_no, bar in enumerate(period.bars, start=1)
    ]
    return replace(period, key=canonical_key, bars=new_bars)


def tokenize_period(period: ChordPeriod) -> list[int]:
    """Transpose a period to canonical key and encode it as a token ID sequence.

    Each chord position becomes four tokens (ROOT, QUAL, EXT, BASS); '.', 'NC'
    and '?' positions become one token each.  A style with no STYLE_x token
    is logged and encoded as STYLE_other.

    Args:
        period: A ChordPeriod as returned by parse_chord_file.

    Returns:
        List of integer token IDs: <BOS>, metadata tokens, per-bar chord
        tokens interleaved with HOLD/NC, each bar closed by BAR, then <EOS>.

    Raises:
        ChordFormatError: If the key is malformed, a chord symbol cannot be
            parsed, or a field has no token in the vocabulary.
    """
    p = transpose_to_canonical(period)
    # Take the mode from the key suffix rather than comparing whole strings.
    mode = p.key.rsplit("_", 1)[-1]

    style_token = f"STYLE_{p.style}"
    if style_token not in TOKEN_TO_ID:
        log.warning("unknown style %r — mapping to STYLE_other", p.style)
        style_token = "STYLE_other"

    ids: list[int] = [
        _token_id(_BOS),
        _token_id(f"MODE_{mode}"),
        _token_id(f"TIME_{p.time}"),
        _token_id(f"SUB_{p.subdivision}"),
        _token_id(style_token),
        _token_id(f"FUNC_{p.function}"),
    ]

    single_token = {".": "HOLD", "NC": "NC", "?": _UNK}
    bar_id = _token_id("BAR")

    for bar_no, bar in enumerate(p.bars, start=1):
        for pos in bar:
            structural = single_token.get(pos)
            if structural is not None:
                ids.append(_token_id(structural))
                continue
            # An already-canonical period skipped transposition, so this may
            # be the first time the symbol is parsed.
            try:
                t = parse_chord_symbol(pos)
            except ChordParseError as exc:
                raise ChordFormatError(f"bar {bar_no}: {exc}") from exc
            ids.append(_token_id(f"ROOT_{t.root}"))
            ids.append(_token_id(_qual_token(t.quality)))
            ids.append(_token_id(f"EXT_{t.extension}"))
            ids.append(_token_id(f"BASS_{t.bass}"))
        ids.append(bar_id)

    ids.append(_token_id(_EOS))
    return ids


def detokenize_to_period(token_ids: list[int]) -> ChordPeriod:
    """Convert a token ID sequence back to a ChordPeriod in canonical key (C/Am).

    Decoding stops at the first <EOS>; anything after it is ignored.

    Args:
        token_ids: Sequence produced by tokenize_period.

    Returns:
        ChordPeriod with key='C_major' or 'A_minor', title='detokenized'.

    Raises:
        ChordFormatError: If the sequence is structurally malformed or
            contains an ID outside the vocabulary.
    """
    vocab_size = len(ID_TO_TOKEN)
    tokens: list[str] = []
    for position, token_id in enumerate(token_ids):
        # Reject negatives explicitly: list indexing would wrap them around.
        if not 0 <= token_id < vocab_size:
            raise ChordFormatError(
                f"invalid token id {token_id!r} at position {position}"
            )
        tokens.append(ID_TO_TOKEN[token_id])

    n = len(tokens)
    idx = 0

    def _consume(prefix: str) -> str:
        """Take the next token, require *prefix*, and return the remainder."""
        nonlocal idx
        if idx >= n:
            raise ChordFormatError(
                f"unexpected end of token sequence; expected '{prefix}...'"
            )
        tok = tokens[idx]
        if not tok.startswith(prefix):
            raise ChordFormatError(
                f"expected token starting with '{prefix}', got {tok!r} at position {idx}"
            )
        idx += 1
        return tok[len(prefix):]

    if not tokens or tokens[0] != _BOS:
        got = repr(tokens[0]) if tokens else "empty sequence"
        raise ChordFormatError(f"token sequence must start with {_BOS}, got {got}")
    idx += 1

    mode = _consume("MODE_")
    time = _consume("TIME_")
    subdivision = int(_consume("SUB_"))
    style = _consume("STYLE_")
    function = _consume("FUNC_")

    key = "C_major" if mode == "major" else "A_minor"

    bars: list[list[str]] = []
    current_bar: list[str] = []

    while idx < n:
        tok = tokens[idx]
        idx += 1

        if tok == _EOS:
            break
        elif tok == "BAR":
            bars.append(current_bar)
            current_bar = []
        elif tok == "HOLD":
            current_bar.append(".")
        elif tok == "NC":
            current_bar.append("NC")
        elif tok == _UNK:
            current_bar.append("?")
        elif tok.startswith("ROOT_"):
            if idx + 3 > n:
                raise ChordFormatError(
                    "incomplete chord token group near end of sequence"
                )
            root = tok[len("ROOT_"):]
            # A chord is always ROOT, QUAL, EXT, BASS in that order; _consume
            # rejects anything else instead of slicing an unrelated token.
            quality = _token_qual("QUAL_" + _consume("QUAL_"))
            extension = _consume("EXT_")
            bass = _consume("BASS_")
            current_bar.append(
                _tokens_to_symbol(ChordTokens(root, quality, extension, bass))
            )
        else:
            raise ChordFormatError(f"unexpected token in bar body: {tok!r}")

    if current_bar:
        raise ChordFormatError(
            f"token sequence ended without closing BAR before {_EOS}"
        )

    return ChordPeriod(
        title="detokenized",
        key=key,
        time=time,
        subdivision=subdivision,
        style=style,
        function=function,
        bars=bars,
    )