# hamori/src/chord_parser.py

"""Chord symbol parser for .chord files.

Parses a chord symbol string (e.g. 'Cmaj7', 'F#m7/A', 'Bb7b9/D') into a
ChordTokens dataclass.  See docs/chord_format_spec.md §4 for the full spec.

Usage:
    from src.chord_parser import parse_chord_symbol, ChordParseError
    tokens = parse_chord_symbol("Fmaj9")
    # ChordTokens(root='F', quality='maj7', extension='9', bass='root')
"""

from __future__ import annotations

from dataclasses import dataclass


class ChordParseError(ValueError):
    """Signal that a chord symbol string could not be parsed.

    Derives from ``ValueError``, so callers may catch either type.
    """


@dataclass(frozen=True)
class ChordTokens:
    """Immutable four-slot breakdown of a single chord symbol.

    Each slot holds one vocabulary string; the dataclass itself performs no
    validation of the values it is given.

    Attributes:
        root: One of the 12 chromatic pitch classes, e.g. 'C', 'F#', 'A#'.
        quality: One of the 18 canonical quality names, e.g. 'maj7', 'm',
            'dim7'.
        extension: One of 8 values: 'none' or e.g. '9', 'b9', '#11'.
        bass: 'root' or one of the 12 pitch classes.
    """

    root: str
    quality: str
    extension: str
    bass: str


# ---------------------------------------------------------------------------
# Lookup tables
# ---------------------------------------------------------------------------

_VALID_ROOTS: frozenset[str] = frozenset(
    {"C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"}
)

_FLAT_TO_SHARP: dict[str, str] = {
    "Cb": "B",
    "Db": "C#",
    "Eb": "D#",
    "Fb": "E",
    "Gb": "F#",
    "Ab": "G#",
    "Bb": "A#",
}

# Maps the quality+extension string (after the root, before any '/') to
# (canonical_quality, canonical_extension).  Covers:
#   - standalone qualities with all their alternative spellings
#   - shorthand expansions where a 7th is implied (e.g. 'maj9' → maj7 + 9)
#   - Unicode symbols (°, Δ, ø)
_QUAL_EXT_MAP: dict[str, tuple[str, str]] = {
    # empty root suffix → plain major
    "": ("maj", "none"),
    # --- major ---
    "maj": ("maj", "none"),
    "maj7": ("maj7", "none"),
    "M7": ("maj7", "none"),
    "Δ7": ("maj7", "none"),
    "Δ": ("maj7", "none"),
    "maj6": ("6", "none"),
    # major shorthands (dominant/major 7th implied by the extension numeral)
    "maj9": ("maj7", "9"),
    "maj11": ("maj7", "11"),
    "maj13": ("maj7", "13"),
    # --- minor ---
    "m": ("m", "none"),
    "min": ("m", "none"),
    "-": ("m", "none"),
    "m7": ("m7", "none"),
    "min7": ("m7", "none"),
    "-7": ("m7", "none"),
    # minor shorthands (m7 implied)
    "m9": ("m7", "9"),
    "min9": ("m7", "9"),
    "m11": ("m7", "11"),
    "min11": ("m7", "11"),
    "m13": ("m7", "13"),
    "min13": ("m7", "13"),
    # minor sixth
    "m6": ("m6", "none"),
    "min6": ("m6", "none"),
    # half-diminished
    "m7b5": ("m7b5", "none"),
    "min7b5": ("m7b5", "none"),
    "m7♭5": ("m7b5", "none"),
    "ø": ("m7b5", "none"),
    "ø7": ("m7b5", "none"),
    # minor-major seventh
    "mM7": ("mM7", "none"),
    "m(maj7)": ("mM7", "none"),
    "minMaj7": ("mM7", "none"),
    # minor add9
    "madd9": ("m(add9)", "none"),
    "m(add9)": ("m(add9)", "none"),
    "m(add2)": ("m(add9)", "none"),
    # --- dominant ---
    "7": ("7", "none"),
    # dominant shorthands (dominant 7th implied)
    "9": ("7", "9"),
    "11": ("7", "11"),
    "13": ("7", "13"),
    # --- diminished ---
    "dim": ("dim", "none"),
    "°": ("dim", "none"),
    "dim7": ("dim7", "none"),
    "°7": ("dim7", "none"),
    # --- augmented ---
    "aug": ("aug", "none"),
    "+": ("aug", "none"),
    "aug7": ("aug7", "none"),
    "+7": ("aug7", "none"),
    "7#5": ("aug7", "none"),
    # --- suspended ---
    "sus2": ("sus2", "none"),
    "sus4": ("sus4", "none"),
    "sus": ("sus4", "none"),
    "7sus4": ("7sus4", "none"),
    "7sus": ("7sus4", "none"),
    # --- sixth / add ---
    "6": ("6", "none"),
    "add9": ("add9", "none"),
    "2": ("add9", "none"),
}

# Maps quality-only strings (no extension) to canonical quality names.
# Used when an explicit extension suffix has been stripped from the end.
_QUAL_ONLY_MAP: dict[str, str] = {
    "": "maj",
    "maj": "maj",
    "maj7": "maj7",
    "M7": "maj7",
    "Δ7": "maj7",
    "Δ": "maj7",
    "maj6": "6",
    "m": "m",
    "min": "m",
    "-": "m",
    "m7": "m7",
    "min7": "m7",
    "-7": "m7",
    "m6": "m6",
    "min6": "m6",
    "m7b5": "m7b5",
    "ø": "m7b5",
    "ø7": "m7b5",
    "mM7": "mM7",
    "7": "7",
    "dim": "dim",
    "°": "dim",
    "dim7": "dim7",
    "°7": "dim7",
    "aug": "aug",
    "+": "aug",
    "aug7": "aug7",
    "+7": "aug7",
    "7#5": "aug7",
    "sus2": "sus2",
    "sus4": "sus4",
    "sus": "sus4",
    "7sus4": "7sus4",
    "7sus": "7sus4",
    "6": "6",
    "add9": "add9",
}

# Extension suffixes tried longest-first to avoid greedy ambiguity.
# (e.g. 'b13' must be tried before '13' so 'maj7b13' is not split at '13')
_EXT_SUFFIXES: tuple[str, ...] = ("b13", "#11", "b9", "#9", "13", "11", "9")


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _normalize_note(raw: str) -> str:
    """Return the canonical sharp spelling of a note name.

    Flat names found in ``_FLAT_TO_SHARP`` are respelled; anything else is
    passed through unchanged and then checked against ``_VALID_ROOTS``.

    Raises:
        ChordParseError: If the (possibly respelled) name is not one of the
            valid pitch classes.  The message quotes the original *raw* text.
    """
    canonical = _FLAT_TO_SHARP[raw] if raw in _FLAT_TO_SHARP else raw
    if canonical in _VALID_ROOTS:
        return canonical
    raise ChordParseError(f"invalid note: {raw!r}")


def _extract_root(s: str) -> tuple[str, str]:
    """Split a chord string into its root note and everything after it.

    The root is one note letter (A–G, upper case) optionally followed by a
    single '#' or 'b'.

    Returns:
        ``(normalized_root, remaining_suffix)`` where the root has been run
        through ``_normalize_note``.

    Raises:
        ChordParseError: If *s* is empty, does not begin with a note letter,
            or the root is not a valid note after normalization.
    """
    # str.startswith with a tuple is False for the empty string, so this
    # single test covers both "empty" and "wrong first character".
    if not s.startswith(tuple("CDEFGAB")):
        raise ChordParseError(
            f"chord symbol must start with a note letter A–G: {s!r}"
        )
    # s[1:2] is '' when there is no second character, which is not an
    # accidental, so the root is then just the letter.
    root_len = 2 if s[1:2] in ("#", "b") else 1
    return _normalize_note(s[:root_len]), s[root_len:]


def _parse_bass(s: str) -> str:
    """Parse the text after '/' into a normalized bass note.

    Surrounding whitespace is ignored.  The remaining text must be exactly
    one note letter (A–G, upper case) with an optional single '#' or 'b'.

    Returns:
        The bass note as normalized by ``_normalize_note``.

    Raises:
        ChordParseError: If the text is empty, does not start with a note
            letter, has characters after the note, or names an invalid note.
    """
    candidate = s.strip()
    if candidate == "":
        raise ChordParseError("empty bass note after '/'")
    if not candidate.startswith(tuple("CDEFGAB")):
        raise ChordParseError(f"invalid bass note: {candidate!r}")

    # One letter, plus one more character when an accidental follows it.
    note_len = 2 if candidate[1:2] in ("#", "b") else 1
    if len(candidate) > note_len:
        raise ChordParseError(
            f"invalid bass note (trailing characters): {candidate!r}"
        )
    return _normalize_note(candidate[:note_len])


def _parse_quality_ext(s: str) -> tuple[str, str]:
    """Resolve a root-less chord suffix to (quality, extension).

    Resolution happens in two stages:

    1. An exact match in ``_QUAL_EXT_MAP`` (standalone qualities, Unicode
       variants and shorthands such as 'maj9').
    2. Otherwise each entry of ``_EXT_SUFFIXES`` is tried in order; the first
       one that ends *s* and leaves a prefix present in ``_QUAL_ONLY_MAP``
       wins.

    Raises:
        ChordParseError: If neither stage recognises *s*.
    """
    exact = _QUAL_EXT_MAP.get(s)
    if exact is not None:
        return exact

    for suffix in _EXT_SUFFIXES:
        if not s.endswith(suffix):
            continue
        # Drop the matched suffix; what remains must be a known quality,
        # otherwise keep looking at the remaining suffixes.
        prefix = s[: len(s) - len(suffix)]
        quality = _QUAL_ONLY_MAP.get(prefix)
        if quality is not None:
            return quality, suffix

    raise ChordParseError(f"unrecognized quality/extension: {s!r}")


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def parse_chord_symbol(symbol: str) -> ChordTokens:
    """Parse a chord symbol string into factorized token slots.

    Leading/trailing whitespace is ignored, as is whitespace on either side
    of the '/' that introduces a bass note ('C /G' and 'C/ G' both parse as
    'C/G').  The conventional six-nine spelling '6/9' (e.g. 'C6/9',
    'Cm6/9/E') is treated as the quality+extension '69', not as a slash bass.

    Args:
        symbol: Chord symbol, e.g. 'Cmaj7', 'F#m7/A', 'Bb7b9/D'.

    Returns:
        ChordTokens(root, quality, extension, bass) with all values drawn
        from the vocabularies in docs/chord_format_spec.md §4.2–4.5.

    Raises:
        ChordParseError: If the symbol is empty, unrecognized, or malformed.
    """
    if not symbol or not symbol.strip():
        raise ChordParseError("empty chord symbol")

    symbol = symbol.strip()

    # A bass note can never start with a digit, so any symbol containing
    # '6/9' would otherwise be rejected.  Folding it to '69' lets the
    # six-nine chord take the same path as the already-supported 'C69'.
    normalized = symbol.replace("6/9", "69")

    if normalized.count("/") > 1:
        raise ChordParseError(f"multiple '/' in chord symbol: {symbol!r}")

    chord_part, slash, bass_str = normalized.partition("/")
    # Parse the bass first so an empty bass is reported before a missing
    # chord part (e.g. for the input '/').
    bass = _parse_bass(bass_str) if slash else "root"

    # _parse_bass strips its own input; strip the chord side as well so
    # whitespace around '/' is tolerated symmetrically.
    chord_part = chord_part.strip()
    if not chord_part:
        raise ChordParseError(f"missing chord before '/': {symbol!r}")

    root, rest = _extract_root(chord_part)
    quality, extension = _parse_quality_ext(rest)

    return ChordTokens(root=root, quality=quality, extension=extension, bass=bass)