feat: implement McGill Billboard converter (Harte → .chord)

Adds src/external_converters/mcgill_to_chord.py with two public functions:
- convert_song(song_dir, output_dir) — converts one salami_chords.txt to per-section .chord files (4–16 bars each, style=other)
- convert_dataset(dataset_dir, output_dir) — batch converts all songs

Key decisions:
- Harte qualities mapped to our 18-quality vocabulary; hdim7 → m7b5, parenthetical alterations (e.g. 7(b9)) handled via regex
- Bar duration estimated from median non-trivial chord duration
- Mode (major/minor) inferred from tonic chord quality distribution
- Sections with <4 or >16 bars are skipped with a logged reason
- Unrecognized Harte chords skip the whole section (no silent corruption)

48 new tests in tests/test_mcgill_converter.py; total suite 223 passed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 17:04:02 +03:00
parent 54be1be9ce
commit ea32bf43b2
3 changed files with 922 additions and 0 deletions
@@ -0,0 +1,642 @@
+"""Convert McGill Billboard dataset (salami_chords.txt) to .chord files.
+
+McGill Billboard format:
+  Each song is a subdirectory (e.g. 0003/, 0004/) containing salami_chords.txt.
+  The file has a header (# key: value) followed by tab-separated data lines:
+      <timestamp>\\t<section_label>\\t<chord>
+
+  Section labels: 'Z' (silence/boundary), a letter (e.g. 'A', 'B,verse'), or '.' (continuation).
+  Chords: Harte notation (e.g. C:maj, Bb:min7, N for no chord, X for unknown).
+
+Public API:
+    convert_dataset(dataset_dir, output_dir)  -- convert entire dataset directory
+    convert_song(song_dir, output_dir)        -- convert one song directory
+
+CLI:
+    python -m src.external_converters.mcgill_to_chord <dataset_dir> [--out <output_dir>]
+
+Example:
+    python -m src.external_converters.mcgill_to_chord data/raw_external/mcgill/ \\
+           --out data/raw_external/mcgill_converted/
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+import re
+import statistics
+from collections import Counter
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+# Module-level logger named after this module. No handler is attached here;
+# the CLI entry point at the bottom of the file calls logging.basicConfig().
+log = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Harte quality → (our_quality, our_extension)
+# ---------------------------------------------------------------------------
+
+# Lookup from the Harte shorthand that follows ':' to a (quality, extension)
+# pair. An extension of "none" means that no extension token is appended when
+# the chord symbol is assembled. Several shorthands deliberately collapse onto
+# the same target: "maj6" and "6" both give "6", and "1", "5" and the empty
+# string all fall back to a plain major triad.
+_HARTE_QUALITY: dict[str, tuple[str, str]] = {
+    "maj":      ("maj",   "none"),
+    "min":      ("m",     "none"),
+    "dim":      ("dim",   "none"),
+    "aug":      ("aug",   "none"),
+    "sus4":     ("sus4",  "none"),
+    "sus2":     ("sus2",  "none"),
+    "maj7":     ("maj7",  "none"),
+    "min7":     ("m7",    "none"),
+    "7":        ("7",     "none"),
+    "hdim7":    ("m7b5",  "none"),
+    "dim7":     ("dim7",  "none"),
+    "minmaj7":  ("mM7",   "none"),
+    "maj6":     ("6",     "none"),
+    "min6":     ("m6",    "none"),
+    "6":        ("6",     "none"),
+    "7sus4":    ("7sus4", "none"),
+    "9":        ("7",     "9"),
+    "maj9":     ("maj7",  "9"),
+    "min9":     ("m7",    "9"),
+    "11":       ("7",     "11"),
+    "maj11":    ("maj7",  "11"),
+    "min11":    ("m7",    "11"),
+    "13":       ("7",     "13"),
+    "maj13":    ("maj7",  "13"),
+    "min13":    ("m7",    "13"),
+    "1":        ("maj",   "none"),  # root only → major
+    "5":        ("maj",   "none"),  # power chord → major (no 3rd)
+    "":         ("maj",   "none"),  # bare root
+}
+
+# Parenthetical alterations in Harte (e.g. '7(b9)') → our extension token.
+# The whole text between the parentheses is used as the lookup key, so only a
+# single alteration is recognized; anything else (for example a comma-separated
+# list) has no entry here and the chord is then treated as unrecognized.
+_HARTE_PAREN_EXT: dict[str, str] = {
+    "b9":  "b9",
+    "#9":  "#9",
+    "#11": "#11",
+    "b13": "b13",
+    "13":  "13",
+    "11":  "11",
+    "9":   "9",
+}
+
+# McGill Billboard section function strings → our function tokens.
+# Keys are matched after the raw function text has been stripped and
+# lower-cased (see _parse_section_label); strings that are not listed here
+# are mapped to "other" by that function.
+_FUNCTION_MAP: dict[str, str] = {
+    "intro":        "intro",
+    "verse":        "verse",
+    "pre-chorus":   "prechorus",
+    "pre_chorus":   "prechorus",
+    "prechorus":    "prechorus",
+    "pre":          "prechorus",
+    "chorus":       "chorus",
+    "refrain":      "chorus",
+    "bridge":       "bridge",
+    "outro":        "outro",
+    "coda":         "outro",
+    "end":          "outro",
+    "interlude":    "interlude",
+    "instrumental": "interlude",
+    "solo":         "interlude",
+    "break":        "other",
+    "transition":   "other",
+    "other":        "other",
+}
+
+# The twelve note names accepted in output symbols; accidentals are always
+# spelled with sharps.
+_VALID_NOTES: frozenset[str] = frozenset(
+    {"C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"}
+)
+
+# Flat spellings and the sharp-canonical name each one is rewritten to.
+# Spellings absent from both tables (e.g. "E#", "B#") are rejected by
+# _normalize_note.
+_FLAT_TO_SHARP: dict[str, str] = {
+    "Cb": "B",  "Db": "C#", "Eb": "D#", "Fb": "E",
+    "Gb": "F#", "Ab": "G#", "Bb": "A#",
+}
+
+# Time signatures that _parse_metre accepts verbatim.
+_VALID_TIMES: frozenset[str] = frozenset({"4/4", "3/4", "6/8", "2/4", "12/8"})
+
+# Quality families used for mode inference.
+# A quality that appears in neither set (for example the dominant "7") does
+# not count towards either mode in _infer_mode.
+_MAJOR_QUALITIES: frozenset[str] = frozenset(
+    {"maj", "maj7", "6", "add9", "aug", "sus2", "sus4", "7sus4", "aug7"}
+)
+_MINOR_QUALITIES: frozenset[str] = frozenset(
+    {"m", "m7", "mM7", "m6", "m7b5", "dim", "dim7"}
+)
+
+# ---------------------------------------------------------------------------
+# Internal data structures
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class _ChordEvent:
+    """One chord annotation with its position and length on the time axis.
+
+    Instances are created by _extract_sections, which only keeps events that
+    have a non-empty chord string and a positive duration.
+    """
+
+    # Timestamp of the data line that introduced this chord.
+    start: float
+    # Distance from this line's timestamp to the next line's timestamp.
+    duration: float  # seconds
+    harte: str       # Harte chord string: 'N', 'X', 'C:maj', etc.
+
+
+@dataclass
+class _Section:
+    """A labelled song section together with the chord events that fall in it."""
+
+    letter: str       # section letter, e.g. 'A', 'B'
+    function: str     # our function token, e.g. 'verse', 'chorus'
+    # Chord events in file order; default_factory gives each section its own list.
+    events: list[_ChordEvent] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Note / chord helpers
+# ---------------------------------------------------------------------------
+
+
+def _normalize_note(raw: str) -> Optional[str]:
+    """Map a note spelling onto the sharp-only note vocabulary.
+
+    Args:
+        raw: Note name such as 'C', 'F#' or 'Bb'.
+
+    Returns:
+        The sharp-canonical name (flats are rewritten through _FLAT_TO_SHARP),
+        or None when the result is not one of _VALID_NOTES.
+    """
+    candidate = _FLAT_TO_SHARP[raw] if raw in _FLAT_TO_SHARP else raw
+    if candidate not in _VALID_NOTES:
+        return None
+    return candidate
+
+
+# Sharp-canonical note names in ascending semitone order (index == pitch class).
+_PITCH_CLASSES: tuple[str, ...] = (
+    "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B",
+)
+
+# Semitones above the root for the natural scale degrees 1..7.
+_DEGREE_SEMITONES: tuple[int, ...] = (0, 2, 4, 5, 7, 9, 11)
+
+# Harte interval: an optional run of '#'/'b' followed by a degree number,
+# e.g. '5', 'b7', '#11'.
+_HARTE_DEGREE_RE = re.compile(r"([#b]*)([0-9]+)")
+
+
+def _split_note_prefix(token: str) -> tuple[str, str]:
+    """Split a token into (letter plus at most one '#'/'b', remaining text)."""
+    cut = 2 if len(token) >= 2 and token[1] in "#b" else 1
+    return token[:cut], token[cut:]
+
+
+def _resolve_bass(bass_raw: str, root: str) -> Optional[str]:
+    """Turn the text after '/' into a sharp-canonical bass note, or None.
+
+    Two spellings are accepted: an absolute note name ('G#', 'Bb') and a
+    Harte interval relative to ``root`` ('3', 'b7', '#5'). ``root`` must
+    already be sharp-canonical.
+    """
+    # Absolute note name: the only form accepted before interval support.
+    name, tail = _split_note_prefix(bass_raw)
+    if name and not tail:
+        note = _normalize_note(name)
+        if note is not None:
+            return note
+
+    # Interval relative to the root, which is how Harte notation spells a bass.
+    match = _HARTE_DEGREE_RE.fullmatch(bass_raw)
+    if match is None:
+        return None
+    accidentals, degree = match.group(1), int(match.group(2))
+    if degree < 1:
+        return None
+    # Degrees above 7 wrap onto the same pitch class (9 → 2, 11 → 4, 13 → 6).
+    semitones = (
+        _DEGREE_SEMITONES[(degree - 1) % 7]
+        + accidentals.count("#")
+        - accidentals.count("b")
+    )
+    return _PITCH_CLASSES[(_PITCH_CLASSES.index(root) + semitones) % 12]
+
+
+def _harte_to_chord_symbol(harte: str) -> Optional[str]:
+    """Convert a Harte chord string to our .chord format symbol.
+
+    The bass after '/' may be written either as a note name or as a Harte
+    interval relative to the root; both produce a note-name bass in the
+    output. Previously interval basses were rejected, which made the caller
+    drop every section containing an inverted chord.
+
+    Args:
+        harte: Harte notation string, e.g. 'C:maj', 'Bb:min7', 'C:maj/5',
+            'A:min7/b7', 'E:hdim7/G#'.
+
+    Returns:
+        Our chord symbol (e.g. 'Cmaj', 'A#m7', 'Cmaj/G', 'Am7/G',
+        'Em7b5/G#'), or None for N (no chord), X (unknown), or any
+        unparseable input.
+    """
+    harte = harte.strip()
+    if harte in ("N", "X", ""):
+        return None
+
+    # Detach the bass (rightmost '/'); it is resolved once the root is known.
+    bass_raw: Optional[str] = None
+    if "/" in harte:
+        harte, bass_raw = harte.rsplit("/", 1)
+
+    # Split root from quality on the first ':'; no colon means a bare root.
+    root_str, _, quality_str = harte.partition(":")
+
+    # Parse root: one letter A-G with at most one accidental and nothing else.
+    if not root_str or root_str[0] not in "CDEFGAB":
+        return None
+    raw_root, leftover = _split_note_prefix(root_str)
+    if leftover:
+        return None
+    root = _normalize_note(raw_root)
+    if root is None:
+        return None
+
+    # Parse quality — handle parenthetical alterations like '7(b9)'
+    m = re.match(r'^([^(]*)\(([^)]+)\)$', quality_str)
+    if m:
+        base_qual, alt = m.group(1), m.group(2)
+        base_result = _HARTE_QUALITY.get(base_qual)
+        if base_result is None:
+            return None
+        # The parenthetical alteration replaces any extension that the base
+        # shorthand carries on its own.
+        our_quality = base_result[0]
+        our_ext = _HARTE_PAREN_EXT.get(alt)
+        if our_ext is None:
+            return None
+    else:
+        result = _HARTE_QUALITY.get(quality_str)
+        if result is None:
+            return None
+        our_quality, our_ext = result
+
+    # Resolve the bass last; an empty or unparseable bass rejects the chord.
+    bass_part = ""
+    if bass_raw is not None:
+        bass_note = _resolve_bass(bass_raw, root)
+        if bass_note is None:
+            return None
+        bass_part = f"/{bass_note}"
+
+    q_ext = our_quality + ("" if our_ext == "none" else our_ext)
+    return root + q_ext + bass_part
+
+
+# ---------------------------------------------------------------------------
+# File parsing
+# ---------------------------------------------------------------------------
+
+
+def _parse_salami_file(
+    path: Path,
+) -> tuple[dict[str, str], list[tuple[float, str, str]]]:
+    """Read a salami_chords.txt file into header fields and data rows.
+
+    Blank lines are ignored. Lines starting with '#' are header lines and are
+    recorded only when they contain a ':'. Every other line is split on tabs
+    and skipped when it has fewer than two columns or when its first column
+    is not a number.
+
+    Returns:
+        (header, events) where header maps lowercase field names to values,
+        and events is a list of (timestamp, label, chord) triples.
+        label may be 'Z', a section letter (possibly with ',function'), or '.'.
+        chord is in Harte notation or '' when the column is absent.
+    """
+    meta: dict[str, str] = {}
+    rows: list[tuple[float, str, str]] = []
+
+    text = path.read_text(encoding="utf-8")
+    for line in (raw.strip() for raw in text.splitlines()):
+        if not line:
+            continue
+
+        if line[0] == "#":
+            # '# key: value' — only the first ':' separates key from value.
+            key, colon, value = line[1:].strip().partition(":")
+            if colon:
+                meta[key.strip().lower()] = value.strip()
+            continue
+
+        columns = line.split("\t")
+        if len(columns) < 2:
+            continue
+        try:
+            stamp = float(columns[0])
+        except ValueError:
+            continue
+
+        chord = columns[2].strip() if len(columns) > 2 else ""
+        rows.append((stamp, columns[1].strip(), chord))
+
+    return meta, rows
+
+
+# ---------------------------------------------------------------------------
+# Section extraction
+# ---------------------------------------------------------------------------
+
+
+def _parse_section_label(label: str) -> tuple[str, str]:
+    """Split a section label such as 'A,verse' into (letter, function token).
+
+    The text after the first comma is stripped, lower-cased and looked up in
+    _FUNCTION_MAP. A label without a comma, or with an unlisted function,
+    yields the function 'other'.
+    """
+    letter, comma, func_raw = label.partition(",")
+    if comma:
+        func = _FUNCTION_MAP.get(func_raw.strip().lower(), "other")
+    else:
+        func = "other"
+    return letter.strip(), func
+
+
+def _extract_sections(
+    events: list[tuple[float, str, str]],
+) -> list[_Section]:
+    """Group raw (timestamp, label, chord) triples into sections.
+
+    A label of 'Z' or '' closes the current section. A label of '.' continues
+    it. Any other label opens a new section. An event is stored only when it
+    has a chord and a positive duration, where the duration is the gap to the
+    next line's timestamp (the final line therefore has duration 0.0).
+    """
+    found: list[_Section] = []
+    active: Optional[_Section] = None
+    last_index = len(events) - 1
+
+    for idx, (ts, label, chord) in enumerate(events):
+        if label in ("Z", ""):
+            active = None
+            continue
+
+        if label != ".":
+            # Anything that is neither a boundary nor a continuation starts
+            # a new section.
+            letter, func = _parse_section_label(label)
+            active = _Section(letter=letter, function=func)
+            found.append(active)
+
+        span = events[idx + 1][0] - ts if idx < last_index else 0.0
+        if active is not None and chord and span > 0:
+            active.events.append(_ChordEvent(ts, span, chord))
+
+    return found
+
+
+# ---------------------------------------------------------------------------
+# Bar quantization
+# ---------------------------------------------------------------------------
+
+
+def _estimate_bar_duration(durations: list[float]) -> float:
+    """Guess the length of one bar, in seconds, from chord durations.
+
+    Durations of 0.5 s or less are ignored. With fewer than three remaining
+    samples the result is the fixed fallback 2.0 s; otherwise it is the
+    median of the remaining samples, limited to the range [1.0, 5.0] s.
+    """
+    usable = list(filter(lambda d: d > 0.5, durations))
+    if len(usable) >= 3:
+        typical = statistics.median(usable)
+        capped = min(5.0, typical)
+        return max(1.0, capped)
+    return 2.0
+
+
+def _expected_positions(time: str, subdivision: int) -> int:
+    """Return how many grid positions one bar has.
+
+    Args:
+        time: Time signature written as 'numerator/denominator'.
+        subdivision: Note value of one grid position (e.g. 4 or 8).
+
+    Returns:
+        numerator * subdivision, integer-divided by the denominator.
+    """
+    numerator, denominator = map(int, time.split("/"))
+    return numerator * subdivision // denominator
+
+
+def _section_to_bars(
+    section: _Section,
+    bar_duration: float,
+    time: str,
+    subdivision: int,
+) -> Optional[list[list[str]]]:
+    """Lay a section's chord events out on a grid of bars.
+
+    Every event occupies at least one bar. Its symbol goes into the first
+    position of its first bar and every other position is filled with '.'.
+    'N' is written as 'NC' and 'X' as '?'.
+
+    Returns None if any event contains an unrecognized Harte chord symbol;
+    the caller will skip the section and log a reason.
+    """
+    width = _expected_positions(time, subdivision)
+    placeholders = {"N": "NC", "X": "?"}
+    grid: list[list[str]] = []
+
+    for ev in section.events:
+        head = placeholders.get(ev.harte)
+        if head is None:
+            head = _harte_to_chord_symbol(ev.harte)
+            if head is None:
+                log.debug(
+                    "unrecognized Harte chord %r in section %s",
+                    ev.harte, section.letter,
+                )
+                return None
+
+        # Number of bars the chord lasts, rounded, never less than one.
+        span = max(1, round(ev.duration / bar_duration))
+        grid.append([head, *(["."] * (width - 1))])
+        # Bars after the first only hold the chord.
+        grid.extend(["."] * width for _ in range(span - 1))
+
+    return grid
+
+
+# ---------------------------------------------------------------------------
+# Mode inference
+# ---------------------------------------------------------------------------
+
+
+def _infer_mode(tonic: str, sections: list[_Section]) -> str:
+    """Decide between 'major' and 'minor' from the chords built on the tonic.
+
+    Each event whose root equals ``tonic`` casts one vote according to the
+    quality family of its chord. Events with an unknown quality, or a quality
+    in neither family, cast no vote. 'minor' is returned only when it has
+    strictly more votes; a tie or no votes gives 'major'.
+    """
+    votes = {"major": 0, "minor": 0}
+
+    for event in (e for s in sections for e in s.events):
+        harte = event.harte
+        if not harte or harte in ("N", "X"):
+            continue
+
+        # Lightweight split instead of a full Harte parse.
+        head, _, tail = harte.partition(":")
+        root_str = head.split("/")[0]
+        width = 2 if len(root_str) >= 2 and root_str[1] in "#b" else 1
+        raw_root = root_str[:width]
+        if not raw_root:
+            continue
+        if _normalize_note(raw_root) != tonic:
+            continue
+
+        # Quality text: drop any bass part, then any parenthetical alteration.
+        quality_str = tail.split("/")[0]
+        base = re.sub(r'\([^)]*\)', "", quality_str).strip()
+        mapped = _HARTE_QUALITY.get(base)
+        if mapped is None:
+            continue
+
+        family = mapped[0]
+        if family in _MAJOR_QUALITIES:
+            votes["major"] += 1
+        elif family in _MINOR_QUALITIES:
+            votes["minor"] += 1
+
+    if votes["minor"] > votes["major"]:
+        return "minor"
+    return "major"
+
+
+# ---------------------------------------------------------------------------
+# Metre parsing
+# ---------------------------------------------------------------------------
+
+
+def _parse_metre(metre: str) -> tuple[Optional[str], int]:
+    """Translate a metre header value into (time signature, subdivision).
+
+    A value listed in _VALID_TIMES is returned unchanged, with subdivision 8
+    for '6/8' and '12/8' and 4 otherwise. A bare integer 4, 3 or 2 is read as
+    that many quarter-note beats. Anything else yields (None, 0).
+    """
+    text = metre.strip()
+    if text in _VALID_TIMES:
+        compound = text == "6/8" or text == "12/8"
+        return text, (8 if compound else 4)
+
+    try:
+        beats = int(text)
+    except ValueError:
+        return None, 0
+
+    if beats == 4:
+        return "4/4", 4
+    if beats == 3:
+        return "3/4", 4
+    if beats == 2:
+        return "2/4", 4
+    return None, 0
+
+
+# ---------------------------------------------------------------------------
+# File writing
+# ---------------------------------------------------------------------------
+
+
+def _write_chord_file(
+    path: Path,
+    title: str,
+    key: str,
+    time: str,
+    subdivision: int,
+    function: Optional[str],
+    bars: list[list[str]],
+) -> None:
+    """Serialize one harmonic period as a .chord file.
+
+    The file starts with '# name: value' header lines (the function line is
+    omitted when ``function`` is empty or None), followed by a blank line and
+    then the bars, four per line, each bar introduced by '| ' and each line
+    closed with ' |'.
+    """
+    header = [
+        f"# title: {title}",
+        f"# key: {key}",
+        f"# time: {time}",
+        f"# subdivision: {subdivision}",
+        "# style: other",
+    ]
+    if function:
+        header.append(f"# function: {function}")
+
+    rows: list[str] = []
+    for start in range(0, len(bars), 4):
+        cells = [f"| {' '.join(bar)}" for bar in bars[start : start + 4]]
+        rows.append(" ".join(cells) + " |")
+
+    # One empty entry separates the header from the body.
+    content = "\n".join(header + [""] + rows) + "\n"
+    path.write_text(content, encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def convert_song(song_dir: Path, output_dir: Path) -> int:
+    """Convert one McGill Billboard song directory to .chord files.
+
+    One file is written per section that converts cleanly and spans 4 to 16
+    bars. Sections with an unrecognized chord, fewer than 4 bars or more than
+    16 bars are skipped and counted in the per-song log line. The song is
+    skipped entirely (return value 0) when salami_chords.txt is missing,
+    cannot be parsed, declares an unsupported metre, or has no sections.
+
+    Args:
+        song_dir:   Directory containing salami_chords.txt (e.g. 0003/).
+        output_dir: Destination directory for .chord files (created if absent).
+
+    Returns:
+        Number of .chord files successfully written.
+    """
+    salami = song_dir / "salami_chords.txt"
+    if not salami.exists():
+        log.warning("no salami_chords.txt in %s, skipping", song_dir)
+        return 0
+
+    # Best-effort per-song boundary: one bad file must not abort a batch run.
+    try:
+        header, raw_events = _parse_salami_file(salami)
+    except Exception as exc:
+        log.error("failed to parse %s: %s", salami, exc)
+        return 0
+
+    song_id = song_dir.name
+
+    time_sig, subdivision = _parse_metre(header.get("metre", "4/4"))
+    if time_sig is None:
+        log.warning(
+            "unsupported metre %r in %s, skipping", header.get("metre"), song_dir
+        )
+        return 0
+
+    # The key written to every output file depends on the tonic, so a guessed
+    # tonic must leave a trace in the log instead of being substituted silently.
+    tonic_raw = header.get("tonic", "").strip()
+    tonic = _normalize_note(tonic_raw)
+    if tonic is None:
+        log.warning(
+            "missing or unrecognized tonic %r in %s, assuming C",
+            tonic_raw, song_dir,
+        )
+        tonic = "C"
+
+    sections = _extract_sections(raw_events)
+    if not sections:
+        log.warning("no sections found in %s", salami)
+        return 0
+
+    # Bar length is estimated once per song from real chords only.
+    all_durations = [
+        e.duration
+        for s in sections
+        for e in s.events
+        if e.harte not in ("N", "X", "") and e.duration > 0.5
+    ]
+    bar_duration = _estimate_bar_duration(all_durations)
+    mode = _infer_mode(tonic, sections)
+    key = f"{tonic}_{mode}"
+
+    artist = header.get("artist", "unknown")
+    song_title = header.get("title", "unknown")
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    n_saved = 0
+    skip_reasons: Counter[str] = Counter()
+
+    for idx, section in enumerate(sections):
+        bars = _section_to_bars(section, bar_duration, time_sig, subdivision)
+        if bars is None:
+            skip_reasons["unrecognized_chord"] += 1
+            continue
+
+        n = len(bars)
+        if n < 4:
+            log.debug(
+                "section %s in %s: %d bar(s) < 4, skipping",
+                section.letter, song_id, n,
+            )
+            skip_reasons["too_short"] += 1
+            continue
+        if n > 16:
+            log.debug(
+                "section %s in %s: %d bars > 16, skipping",
+                section.letter, song_id, n,
+            )
+            skip_reasons["too_long"] += 1
+            continue
+
+        func = section.function
+        # idx is the section's position in the song, so names stay unique
+        # even when several sections share a function.
+        filename = f"mcgill_{song_id}_{idx:02d}_{func}.chord"
+        out_path = output_dir / filename
+        period_title = f"{artist} - {song_title} ({section.letter},{func})"
+        # NOTE(review): _parse_section_label never produces "unspecified" in
+        # the code visible here, so the function header is always written.
+        _write_chord_file(
+            out_path, period_title, key, time_sig, subdivision,
+            func if func != "unspecified" else None, bars,
+        )
+        n_saved += 1
+        log.debug("wrote %s", out_path.name)
+
+    if skip_reasons:
+        log.info(
+            "song %s: saved=%d  skipped=%s", song_id, n_saved, dict(skip_reasons)
+        )
+    else:
+        log.info("song %s: saved %d period(s)", song_id, n_saved)
+
+    return n_saved
+
+
+def convert_dataset(dataset_dir: Path, output_dir: Path) -> tuple[int, int]:
+    """Run convert_song over every subdirectory of a dataset, in sorted order.
+
+    Args:
+        dataset_dir: Root directory containing per-song subdirectories.
+        output_dir:  Destination directory for .chord files.
+
+    Returns:
+        (n_saved, n_empty) where n_empty counts songs that produced no output.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    song_dirs = sorted(entry for entry in dataset_dir.iterdir() if entry.is_dir())
+    per_song = [convert_song(song_dir, output_dir) for song_dir in song_dirs]
+
+    n_saved = sum(per_song)
+    n_empty = per_song.count(0)
+
+    log.info(
+        "conversion complete: %d periods saved, %d songs produced no output",
+        n_saved, n_empty,
+    )
+    return n_saved, n_empty
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    # Command-line front end: parse arguments, configure logging, convert the
+    # dataset and print a two-line summary.
+    cli = argparse.ArgumentParser(
+        description="Convert McGill Billboard dataset to .chord files.",
+        epilog=(
+            "Example:\n"
+            "  python -m src.external_converters.mcgill_to_chord "
+            "data/raw_external/mcgill/ --out data/raw_external/mcgill_converted/"
+        ),
+        # Keep the line break inside the epilog example.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    cli.add_argument(
+        "dataset_dir",
+        metavar="dataset_dir",
+        type=Path,
+        help="directory containing per-song subdirectories (0003/, 0004/, ...)",
+    )
+    cli.add_argument(
+        "--out",
+        metavar="output_dir",
+        type=Path,
+        default=Path("data/raw_external/mcgill_converted"),
+        help="destination for .chord files (default: data/raw_external/mcgill_converted/)",
+    )
+    cli.add_argument(
+        "--log-level",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
+        default="INFO",
+        help="logging verbosity (default: INFO)",
+    )
+    opts = cli.parse_args()
+
+    logging.basicConfig(format="%(message)s", level=getattr(logging, opts.log_level))
+    saved_total, empty_total = convert_dataset(opts.dataset_dir, opts.out)
+    print(f"Saved {saved_total} periods. {empty_total} song(s) produced no output.")
+    print(f"Output: {opts.out}")