feat: implement McGill Billboard converter (Harte → .chord)

Adds src/external_converters/mcgill_to_chord.py with two public functions:
  - convert_song(song_dir, output_dir) — converts one salami_chords.txt to
    per-section .chord files (4–16 bars each, style=other)
  - convert_dataset(dataset_dir, output_dir) — batch converts all songs

Key decisions:
  - Harte qualities mapped to our 18-quality vocabulary; hdim7 → m7b5,
    parenthetical alterations (e.g. 7(b9)) handled via regex
  - Bar duration estimated from median non-trivial chord duration
  - Mode (major/minor) inferred from tonic chord quality distribution
  - Sections with <4 or >16 bars are skipped with a logged reason
  - Unrecognized Harte chords skip the whole section (no silent corruption)

48 new tests in tests/test_mcgill_converter.py; total suite 223 passed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-19 17:04:02 +03:00
parent 54be1be9ce
commit ea32bf43b2
3 changed files with 922 additions and 0 deletions
+642
View File
@@ -0,0 +1,642 @@
"""Convert McGill Billboard dataset (salami_chords.txt) to .chord files.
McGill Billboard format:
Each song is a subdirectory (e.g. 0003/, 0004/) containing salami_chords.txt.
The file has a header (# key: value) followed by tab-separated data lines:
<timestamp>\\t<section_label>\\t<chord>
Section labels: 'Z' (silence/boundary), a letter (e.g. 'A', 'B,verse'), or '.' (continuation).
Chords: Harte notation (e.g. C:maj, Bb:min7, N for no chord, X for unknown).
Public API:
convert_dataset(dataset_dir, output_dir) -- convert entire dataset directory
convert_song(song_dir, output_dir) -- convert one song directory
CLI:
python -m src.external_converters.mcgill_to_chord <dataset_dir> [--out <output_dir>]
Example:
python -m src.external_converters.mcgill_to_chord data/raw_external/mcgill/ \\
--out data/raw_external/mcgill_converted/
"""
from __future__ import annotations
import argparse
import logging
import re
import statistics
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Harte quality → (our_quality, our_extension)
# ---------------------------------------------------------------------------
# Lookup from a Harte shorthand (the text after ':') to the pair
# (quality token, extension token) used when building .chord symbols.
# The extension token "none" means "no extension".
_HARTE_QUALITY: dict[str, tuple[str, str]] = {
    # --- triads and suspensions ---
    "maj": ("maj", "none"),
    "min": ("m", "none"),
    "dim": ("dim", "none"),
    "aug": ("aug", "none"),
    "sus4": ("sus4", "none"),
    "sus2": ("sus2", "none"),
    # --- sevenths and sixths ---
    "maj7": ("maj7", "none"),
    "min7": ("m7", "none"),
    "7": ("7", "none"),
    "hdim7": ("m7b5", "none"),
    "dim7": ("dim7", "none"),
    "minmaj7": ("mM7", "none"),
    "maj6": ("6", "none"),
    "min6": ("m6", "none"),
    "6": ("6", "none"),
    "7sus4": ("7sus4", "none"),
    # --- extended chords: seventh-chord quality plus an extension ---
    "9": ("7", "9"),
    "maj9": ("maj7", "9"),
    "min9": ("m7", "9"),
    "11": ("7", "11"),
    "maj11": ("maj7", "11"),
    "min11": ("m7", "11"),
    "13": ("7", "13"),
    "maj13": ("maj7", "13"),
    "min13": ("m7", "13"),
    # --- degenerate shorthands, all folded into plain major ---
    "1": ("maj", "none"),  # root note only
    "5": ("maj", "none"),  # power chord (has no third)
    "": ("maj", "none"),  # bare root without any shorthand
}
# Parenthetical alterations in Harte (e.g. '7(b9)') → our extension token
# Every supported parenthetical alteration maps to an identically spelled
# extension token, so the table is built as an identity mapping.
_HARTE_PAREN_EXT: dict[str, str] = {
    token: token for token in ("b9", "#9", "#11", "b13", "13", "11", "9")
}
# McGill Billboard section function strings → our function tokens
# Lower-cased section-function string from a section label → function token
# written into the .chord header. Strings absent from this table are handled
# by the caller's default.
_FUNCTION_MAP: dict[str, str] = {
    "intro": "intro",
    "verse": "verse",
    # spelling variants of pre-chorus
    "pre-chorus": "prechorus",
    "pre_chorus": "prechorus",
    "prechorus": "prechorus",
    "pre": "prechorus",
    # chorus and its synonym
    "chorus": "chorus",
    "refrain": "chorus",
    "bridge": "bridge",
    # endings
    "outro": "outro",
    "coda": "outro",
    "end": "outro",
    # instrumental passages
    "interlude": "interlude",
    "instrumental": "interlude",
    "solo": "interlude",
    # everything else that is explicitly named
    "break": "other",
    "transition": "other",
    "other": "other",
}
# The twelve pitch classes, spelled with sharps only.
_VALID_NOTES: frozenset[str] = frozenset(
    "C C# D D# E F F# G G# A A# B".split()
)
# Flat spellings and their sharp-canonical equivalents.
_FLAT_TO_SHARP: dict[str, str] = {
    "Cb": "B",
    "Db": "C#",
    "Eb": "D#",
    "Fb": "E",
    "Gb": "F#",
    "Ab": "G#",
    "Bb": "A#",
}
# Time signatures accepted verbatim by the metre parser.
_VALID_TIMES: frozenset[str] = frozenset(("4/4", "3/4", "6/8", "2/4", "12/8"))
# Quality families used for mode inference
# Quality tokens that count as evidence for a major key during mode inference.
_MAJOR_QUALITIES: frozenset[str] = frozenset((
    "maj", "maj7", "6", "add9", "aug", "sus2", "sus4", "7sus4", "aug7",
))
# Quality tokens that count as evidence for a minor key during mode inference.
_MINOR_QUALITIES: frozenset[str] = frozenset((
    "m", "m7", "mM7", "m6", "m7b5", "dim", "dim7",
))
# ---------------------------------------------------------------------------
# Internal data structures
# ---------------------------------------------------------------------------
@dataclass
class _ChordEvent:
    """One timed chord annotation taken from a salami_chords.txt data line."""

    # Timestamp at which the chord begins.
    start: float
    # Length of the chord in seconds.
    duration: float
    # Chord label in Harte notation, e.g. 'N', 'X' or 'C:maj'.
    harte: str
@dataclass
class _Section:
    """A labelled song section together with the chord events inside it."""

    # Section letter from the annotation, e.g. 'A' or 'B'.
    letter: str
    # Function token for the section, e.g. 'verse' or 'chorus'.
    function: str
    # Chord events in the section; each instance gets its own fresh list.
    events: list[_ChordEvent] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Note / chord helpers
# ---------------------------------------------------------------------------
def _normalize_note(raw: str) -> Optional[str]:
    """Translate a note name to its sharp spelling.

    Args:
        raw: Note name such as 'C', 'F#' or 'Bb'.

    Returns:
        The name with any flat spelling replaced by its sharp equivalent,
        or None when the result is not one of the twelve valid notes.
    """
    candidate = _FLAT_TO_SHARP[raw] if raw in _FLAT_TO_SHARP else raw
    if candidate in _VALID_NOTES:
        return candidate
    return None
# Pitch classes in ascending semitone order, sharp spellings only.
_CHROMATIC_SHARPS: tuple[str, ...] = (
    "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B",
)
# Semitones above the root for major-scale degrees 1..7.
_MAJOR_SCALE_SEMITONES: tuple[int, ...] = (0, 2, 4, 5, 7, 9, 11)
# '<shorthand>(<alteration>)', e.g. '7(b9)'.
_PAREN_QUALITY_RE = re.compile(r'^([^(]*)\(([^)]+)\)$')
# Harte bass interval: optional accidentals followed by a degree number.
_BASS_DEGREE_RE = re.compile(r'^([#b]*)([0-9]+)$')


def _parse_note_name(text: str) -> Optional[str]:
    """Parse a bare note name ('C', 'Bb', 'F#') to sharp-canonical form, or None."""
    if not text or text[0] not in "CDEFGAB":
        return None
    width = 2 if len(text) >= 2 and text[1] in "#b" else 1
    if text[width:]:
        # Anything after the letter and optional accidental is not a note name.
        return None
    return _normalize_note(text[:width])


def _bass_from_degree(root: str, degree: str) -> Optional[str]:
    """Resolve a Harte bass interval ('3', 'b7', '#5') above *root* to a note name."""
    match = _BASS_DEGREE_RE.match(degree)
    if match is None:
        return None
    accidentals, number = match.group(1), int(match.group(2))
    if number < 1:
        return None
    # Degrees above 7 wrap into the next octave (9 -> 2nd, 11 -> 4th, ...).
    octave, step = divmod(number - 1, 7)
    semitones = (
        12 * octave
        + _MAJOR_SCALE_SEMITONES[step]
        + accidentals.count("#")
        - accidentals.count("b")
    )
    return _CHROMATIC_SHARPS[(_CHROMATIC_SHARPS.index(root) + semitones) % 12]


def _harte_to_chord_symbol(harte: str) -> Optional[str]:
    """Convert a Harte chord string to our .chord format symbol.

    The bass after the rightmost '/' may be written either as a note name
    ('E:hdim7/G#') or, as in Harte notation proper, as an interval degree
    relative to the root ('C:maj/3', 'A:min/b3', 'G:7/b7').

    Args:
        harte: Harte notation string, e.g. 'C:maj', 'Bb:min7', 'C:maj/3'.

    Returns:
        Our chord symbol (e.g. 'Cmaj', 'A#m7', 'Em7b5/G#', 'Cmaj/E'), or
        None for N (no chord), X (unknown), or any unparseable input.
    """
    harte = harte.strip()
    if harte in ("N", "X", ""):
        return None

    # Split off the bass (rightmost '/'); it is resolved last because a
    # degree bass can only be interpreted once the root is known.
    bass_raw: Optional[str] = None
    if "/" in harte:
        harte, bass_raw = harte.rsplit("/", 1)

    # Root and shorthand are separated by the first ':'; a missing ':'
    # means a bare root with an empty shorthand.
    root_str, _, quality_str = harte.partition(":")
    root = _parse_note_name(root_str)
    if root is None:
        return None

    # Shorthand, optionally followed by one parenthetical alteration.
    paren = _PAREN_QUALITY_RE.match(quality_str)
    if paren:
        base = _HARTE_QUALITY.get(paren.group(1))
        our_ext = _HARTE_PAREN_EXT.get(paren.group(2))
        if base is None or our_ext is None:
            return None
        # The alteration takes the place of the shorthand's own extension.
        our_quality = base[0]
    else:
        plain = _HARTE_QUALITY.get(quality_str)
        if plain is None:
            return None
        our_quality, our_ext = plain

    symbol = root + our_quality + ("" if our_ext == "none" else our_ext)
    if bass_raw is None:
        return symbol

    # Note-name basses keep working as before; degree basses are new.
    bass = _parse_note_name(bass_raw) or _bass_from_degree(root, bass_raw)
    if bass is None:
        return None
    return f"{symbol}/{bass}"
# ---------------------------------------------------------------------------
# File parsing
# ---------------------------------------------------------------------------
def _parse_salami_file(
    path: Path,
) -> tuple[dict[str, str], list[tuple[float, str, str]]]:
    """Read a salami_chords.txt file into header fields and raw events.

    Lines starting with '#' that contain a ':' are header lines; the key is
    lower-cased and both key and value are stripped. Every other non-blank
    line is split on tabs and kept only if it has at least two fields and
    its first field parses as a float.

    Args:
        path: Location of the salami_chords.txt file (read as UTF-8).

    Returns:
        (header, events). events holds (timestamp, label, chord) triples in
        file order; chord is '' when the line has no third column.
    """
    header: dict[str, str] = {}
    events: list[tuple[float, str, str]] = []
    text = path.read_text(encoding="utf-8")
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped == "":
            continue

        if stripped[0] == "#":
            # Header line: '# key: value'. Lines without ':' are ignored.
            key, colon, value = stripped[1:].strip().partition(":")
            if colon:
                header[key.strip().lower()] = value.strip()
            continue

        columns = stripped.split("\t")
        if len(columns) < 2:
            continue
        try:
            timestamp = float(columns[0])
        except ValueError:
            # Not a data line; skip it.
            continue
        chord = columns[2].strip() if len(columns) > 2 else ""
        events.append((timestamp, columns[1].strip(), chord))
    return header, events
# ---------------------------------------------------------------------------
# Section extraction
# ---------------------------------------------------------------------------
def _parse_section_label(label: str) -> tuple[str, str]:
    """Split a section label such as 'A,verse' into (letter, function).

    The text after the first comma is stripped, lower-cased and looked up
    in _FUNCTION_MAP. A label without a comma, or with an unknown function
    string, gets the function 'other'.
    """
    letter, comma, func_raw = label.partition(",")
    if comma:
        func = _FUNCTION_MAP.get(func_raw.strip().lower(), "other")
    else:
        func = "other"
    return letter.strip(), func
def _extract_sections(
    events: list[tuple[float, str, str]],
) -> list[_Section]:
    """Group raw (timestamp, label, chord) triples into sections.

    An event's duration is the gap to the next event's timestamp; the last
    event gets 0.0. A 'Z' or empty label closes the open section. A '.'
    label continues the open section, if there is one. Any other label
    opens a new section. A chord is recorded only when it is non-empty and
    its duration is positive.
    """
    sections: list[_Section] = []
    active: Optional[_Section] = None
    last_index = len(events) - 1
    for index, (start, label, chord) in enumerate(events):
        span = events[index + 1][0] - start if index < last_index else 0.0

        if label == "Z" or label == "":
            # Boundary: events up to the next section label are ignored.
            active = None
            continue

        if label != ".":
            # Anything other than a continuation marker opens a section.
            letter, func = _parse_section_label(label)
            active = _Section(letter=letter, function=func)
            sections.append(active)

        if active is not None and chord and span > 0:
            active.events.append(_ChordEvent(start, span, chord))
    return sections
# ---------------------------------------------------------------------------
# Bar quantization
# ---------------------------------------------------------------------------
def _estimate_bar_duration(durations: list[float]) -> float:
    """Estimate the length of one bar in seconds.

    The median of the durations longer than 0.5 s is taken as a proxy for
    one bar and clamped to [1.0, 5.0] s (covers ~48–240 BPM in 4/4).
    With fewer than 3 such durations the result is 2.0 s.
    """
    usable = list(filter(lambda seconds: seconds > 0.5, durations))
    if len(usable) >= 3:
        return max(1.0, min(5.0, statistics.median(usable)))
    return 2.0
def _expected_positions(time: str, subdivision: int) -> int:
    """Return how many positions one bar has.

    Computed as numerator * subdivision // denominator of the 'N/D' time
    signature string.
    """
    numerator, denominator = map(int, time.split("/"))
    return numerator * subdivision // denominator
def _section_to_bars(
    section: _Section,
    bar_duration: float,
    time: str,
    subdivision: int,
) -> Optional[list[list[str]]]:
    """Turn a section's chord events into a list of bars.

    Every event starts a new bar whose first position carries the chord
    ('NC' for N, '?' for X) and whose remaining positions are '.'. An event
    lasting round(duration / bar_duration) bars, with a minimum of one, is
    padded with all-'.' bars after its first bar.

    Returns:
        The bars, or None as soon as an event has a Harte chord that cannot
        be converted; the caller skips the section and records the reason.
    """
    width = _expected_positions(time, subdivision)
    hold_tail = ["."] * (width - 1)
    bars: list[list[str]] = []
    for event in section.events:
        harte = event.harte
        if harte == "N":
            head = "NC"
        elif harte == "X":
            head = "?"
        else:
            head = _harte_to_chord_symbol(harte)
            if head is None:
                log.debug(
                    "unrecognized Harte chord %r in section %s",
                    event.harte, section.letter,
                )
                return None

        span = max(1, round(event.duration / bar_duration))
        bars.append([head, *hold_tail])
        # The chord is held over the remaining bars it spans.
        bars.extend(["."] * width for _ in range(span - 1))
    return bars
# ---------------------------------------------------------------------------
# Mode inference
# ---------------------------------------------------------------------------
def _infer_mode(tonic: str, sections: list[_Section]) -> str:
    """Decide between 'major' and 'minor' from chords built on the tonic.

    Each chord event whose root equals *tonic* is tallied as major or minor
    according to the family of its mapped quality. Events with N, X or an
    empty label, another root, or an unmapped shorthand are ignored, and
    qualities in neither family add to neither tally.

    Returns:
        'minor' only when minor chords strictly outnumber major ones;
        otherwise 'major' (including ties and the no-data case).
    """
    tally = {"major": 0, "minor": 0}
    all_events = (event for section in sections for event in section.events)
    for event in all_events:
        harte = event.harte
        if not harte or harte in ("N", "X"):
            continue

        # Lightweight split: '<root>[:<shorthand>][/<bass>]'.
        head, _, tail = harte.partition(":")
        root_text = head.partition("/")[0]
        has_accidental = len(root_text) >= 2 and root_text[1] in "#b"
        raw_root = root_text[:2] if has_accidental else root_text[:1]
        if not raw_root or _normalize_note(raw_root) != tonic:
            continue

        # Drop the bass and any parenthetical alteration from the shorthand.
        shorthand = re.sub(r'\([^)]*\)', "", tail.partition("/")[0]).strip()
        mapped = _HARTE_QUALITY.get(shorthand)
        if mapped is None:
            continue
        if mapped[0] in _MAJOR_QUALITIES:
            tally["major"] += 1
        elif mapped[0] in _MINOR_QUALITIES:
            tally["minor"] += 1
    return "minor" if tally["minor"] > tally["major"] else "major"
# ---------------------------------------------------------------------------
# Metre parsing
# ---------------------------------------------------------------------------
def _parse_metre(metre: str) -> tuple[Optional[str], int]:
    """Interpret a metre string as (time_signature, subdivision).

    A string in _VALID_TIMES is returned as-is, with subdivision 8 for
    '6/8' and '12/8' and 4 otherwise. A bare integer 4, 3 or 2 is read as
    that many quarter-note beats per bar. Anything else yields (None, 0).
    """
    text = metre.strip()
    if text in _VALID_TIMES:
        return text, (8 if text in ("6/8", "12/8") else 4)
    try:
        beats = int(text)
    except ValueError:
        return None, 0
    by_beats = {4: ("4/4", 4), 3: ("3/4", 4), 2: ("2/4", 4)}
    return by_beats.get(beats, (None, 0))
# ---------------------------------------------------------------------------
# File writing
# ---------------------------------------------------------------------------
def _write_chord_file(
    path: Path,
    title: str,
    key: str,
    time: str,
    subdivision: int,
    function: Optional[str],
    bars: list[list[str]],
) -> None:
    """Write one harmonic period to *path* as a .chord file.

    The file has '# name: value' header lines (title, key, time,
    subdivision, a fixed 'style: other', and 'function' when it is truthy),
    a blank line, then the bars four to a line. Each bar is rendered as
    '| ' followed by its space-joined positions, and each line ends ' |'.
    The file is written as UTF-8 and ends with a newline.
    """
    header = [
        f"# title: {title}",
        f"# key: {key}",
        f"# time: {time}",
        f"# subdivision: {subdivision}",
        "# style: other",
    ]
    if function:
        header.append(f"# function: {function}")

    body = []
    for first in range(0, len(bars), 4):
        rendered = [f"| {' '.join(bar)}" for bar in bars[first:first + 4]]
        body.append(" ".join(rendered) + " |")

    # The empty string produces the blank line between header and body.
    path.write_text("\n".join([*header, "", *body]) + "\n", encoding="utf-8")
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def convert_song(song_dir: Path, output_dir: Path) -> int:
    """Convert one McGill Billboard song directory to .chord files.

    Each section that quantizes to between 4 and 16 bars is written to its
    own file named 'mcgill_<song_id>_<index>_<function>.chord'. Sections
    that are too short, too long, or contain an unrecognized chord are
    skipped and counted in the per-song log line.

    Args:
        song_dir: Directory containing salami_chords.txt (e.g. 0003/).
        output_dir: Destination directory for .chord files (created if absent).

    Returns:
        Number of .chord files successfully written. 0 is also returned when
        the input file is missing or unparseable, the metre is unsupported,
        or no sections are found.
    """
    salami = song_dir / "salami_chords.txt"
    if not salami.exists():
        log.warning("no salami_chords.txt in %s, skipping", song_dir)
        return 0
    try:
        header, raw_events = _parse_salami_file(salami)
    except Exception as exc:
        # Batch boundary: one unreadable file must not abort the whole run.
        log.error("failed to parse %s: %s", salami, exc)
        return 0

    song_id = song_dir.name
    time_sig, subdivision = _parse_metre(header.get("metre", "4/4"))
    if time_sig is None:
        log.warning(
            "unsupported metre %r in %s, skipping", header.get("metre"), song_dir
        )
        return 0

    # The key still falls back to C when the tonic is missing or
    # unrecognized, but the fallback is now reported instead of silently
    # labelling every output file with a possibly wrong key.
    tonic_raw = header.get("tonic", "").strip()
    tonic = _normalize_note(tonic_raw)
    if tonic is None:
        log.warning(
            "missing or unrecognized tonic %r in %s, assuming C",
            tonic_raw, song_dir,
        )
        tonic = "C"

    sections = _extract_sections(raw_events)
    if not sections:
        log.warning("no sections found in %s", salami)
        return 0

    # One bar length is estimated per song from all real chords.
    all_durations = [
        e.duration
        for s in sections
        for e in s.events
        if e.harte not in ("N", "X", "") and e.duration > 0.5
    ]
    bar_duration = _estimate_bar_duration(all_durations)
    mode = _infer_mode(tonic, sections)
    key = f"{tonic}_{mode}"
    artist = header.get("artist", "unknown")
    song_title = header.get("title", "unknown")

    output_dir.mkdir(parents=True, exist_ok=True)
    n_saved = 0
    skip_reasons: Counter[str] = Counter()
    for idx, section in enumerate(sections):
        bars = _section_to_bars(section, bar_duration, time_sig, subdivision)
        if bars is None:
            skip_reasons["unrecognized_chord"] += 1
            continue
        n = len(bars)
        if n < 4:
            log.debug(
                "section %s in %s: %d bar(s) < 4, skipping",
                section.letter, song_id, n,
            )
            skip_reasons["too_short"] += 1
            continue
        if n > 16:
            log.debug(
                "section %s in %s: %d bars > 16, skipping",
                section.letter, song_id, n,
            )
            skip_reasons["too_long"] += 1
            continue

        func = section.function
        # idx is the section's position in the song, so names stay unique
        # even when several sections share a function.
        filename = f"mcgill_{song_id}_{idx:02d}_{func}.chord"
        out_path = output_dir / filename
        period_title = f"{artist} - {song_title} ({section.letter},{func})"
        _write_chord_file(
            out_path, period_title, key, time_sig, subdivision,
            func if func != "unspecified" else None, bars,
        )
        n_saved += 1
        log.debug("wrote %s", out_path.name)

    if skip_reasons:
        log.info(
            "song %s: saved=%d skipped=%s", song_id, n_saved, dict(skip_reasons)
        )
    else:
        log.info("song %s: saved %d period(s)", song_id, n_saved)
    return n_saved
def convert_dataset(dataset_dir: Path, output_dir: Path) -> tuple[int, int]:
    """Convert every song directory found directly under *dataset_dir*.

    Subdirectories are processed in sorted order; plain files are ignored.

    Args:
        dataset_dir: Root directory containing per-song subdirectories.
        output_dir: Destination directory for .chord files (created if absent).

    Returns:
        (n_saved, n_empty): total .chord files written, and the number of
        songs for which nothing was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    song_dirs = sorted(entry for entry in dataset_dir.iterdir() if entry.is_dir())
    per_song = [convert_song(song_dir, output_dir) for song_dir in song_dirs]
    n_saved = sum(per_song)
    n_empty = per_song.count(0)
    log.info(
        "conversion complete: %d periods saved, %d songs produced no output",
        n_saved, n_empty,
    )
    return n_saved, n_empty
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Command-line front end: parse arguments, configure logging, convert
    # the dataset and print a short summary.
    cli = argparse.ArgumentParser(
        description="Convert McGill Billboard dataset to .chord files.",
        epilog=(
            "Example:\n"
            " python -m src.external_converters.mcgill_to_chord "
            "data/raw_external/mcgill/ --out data/raw_external/mcgill_converted/"
        ),
        # Keep the line break inside the epilog as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument(
        "dataset_dir",
        type=Path,
        metavar="dataset_dir",
        help="directory containing per-song subdirectories (0003/, 0004/, ...)",
    )
    cli.add_argument(
        "--out",
        type=Path,
        default=Path("data/raw_external/mcgill_converted"),
        metavar="output_dir",
        help="destination for .chord files (default: data/raw_external/mcgill_converted/)",
    )
    cli.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="logging verbosity (default: INFO)",
    )
    options = cli.parse_args()

    # The level name is restricted by `choices`, so the lookup always succeeds.
    level = getattr(logging, options.log_level)
    logging.basicConfig(level=level, format="%(message)s")

    saved_total, empty_total = convert_dataset(options.dataset_dir, options.out)
    print(f"Saved {saved_total} periods. {empty_total} song(s) produced no output.")
    print(f"Output: {options.out}")