feat: implement McGill Billboard converter (Harte → .chord)
Adds src/external_converters/mcgill_to_chord.py with two public functions:
- convert_song(song_dir, output_dir) — converts one salami_chords.txt to
per-section .chord files (4–16 bars each, style=other)
- convert_dataset(dataset_dir, output_dir) — batch converts all songs
Key decisions:
- Harte qualities mapped to our 18-quality vocabulary; hdim7 → m7b5,
parenthetical alterations (e.g. 7(b9)) handled via regex
- Bar duration estimated from median non-trivial chord duration
- Mode (major/minor) inferred from tonic chord quality distribution
- Sections with <4 or >16 bars are skipped with a logged reason
- Unrecognized Harte chords skip the whole section (no silent corruption)
48 new tests in tests/test_mcgill_converter.py; total suite 223 passed.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,642 @@
|
||||
"""Convert McGill Billboard dataset (salami_chords.txt) to .chord files.
|
||||
|
||||
McGill Billboard format:
|
||||
Each song is a subdirectory (e.g. 0003/, 0004/) containing salami_chords.txt.
|
||||
The file has a header (# key: value) followed by tab-separated data lines:
|
||||
<timestamp>\\t<section_label>\\t<chord>
|
||||
|
||||
Section labels: 'Z' (silence/boundary), a letter (e.g. 'A', 'B,verse'), or '.' (continuation).
|
||||
Chords: Harte notation (e.g. C:maj, Bb:min7, N for no chord, X for unknown).
|
||||
|
||||
Public API:
|
||||
convert_dataset(dataset_dir, output_dir) -- convert entire dataset directory
|
||||
convert_song(song_dir, output_dir) -- convert one song directory
|
||||
|
||||
CLI:
|
||||
python -m src.external_converters.mcgill_to_chord <dataset_dir> [--out <output_dir>]
|
||||
|
||||
Example:
|
||||
python -m src.external_converters.mcgill_to_chord data/raw_external/mcgill/ \\
|
||||
--out data/raw_external/mcgill_converted/
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import re
|
||||
import statistics
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Harte quality → (our_quality, our_extension)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Lookup from the Harte shorthand after ':' to a (quality, extension) pair.
# The extension "none" means nothing is appended to the quality when the
# chord symbol is assembled.
_HARTE_QUALITY: dict[str, tuple[str, str]] = {
    # Triads and suspended chords
    "maj": ("maj", "none"),
    "min": ("m", "none"),
    "dim": ("dim", "none"),
    "aug": ("aug", "none"),
    "sus4": ("sus4", "none"),
    "sus2": ("sus2", "none"),
    # Sevenths and sixths
    "maj7": ("maj7", "none"),
    "min7": ("m7", "none"),
    "7": ("7", "none"),
    "hdim7": ("m7b5", "none"),
    "dim7": ("dim7", "none"),
    "minmaj7": ("mM7", "none"),
    "maj6": ("6", "none"),
    "min6": ("m6", "none"),
    "6": ("6", "none"),
    "7sus4": ("7sus4", "none"),
    # Extended chords: a seventh-chord quality plus an extension token
    "9": ("7", "9"),
    "maj9": ("maj7", "9"),
    "min9": ("m7", "9"),
    "11": ("7", "11"),
    "maj11": ("maj7", "11"),
    "min11": ("m7", "11"),
    "13": ("7", "13"),
    "maj13": ("maj7", "13"),
    "min13": ("m7", "13"),
    # Degenerate shorthands, all folded into plain major
    "1": ("maj", "none"),  # root only → major
    "5": ("maj", "none"),  # power chord → major (no 3rd)
    "": ("maj", "none"),  # bare root
}

# Parenthetical alterations in Harte (e.g. '7(b9)') → our extension token.
# Only a single alteration inside the parentheses is looked up here.
_HARTE_PAREN_EXT: dict[str, str] = {
    "b9": "b9",
    "#9": "#9",
    "#11": "#11",
    "b13": "b13",
    "13": "13",
    "11": "11",
    "9": "9",
}

# McGill Billboard section function strings → our function tokens.
# Keys are matched after lower-casing and stripping.
_FUNCTION_MAP: dict[str, str] = {
    "intro": "intro",
    "verse": "verse",
    "pre-chorus": "prechorus",
    "pre_chorus": "prechorus",
    "prechorus": "prechorus",
    "pre": "prechorus",
    "chorus": "chorus",
    "refrain": "chorus",
    "bridge": "bridge",
    "outro": "outro",
    "coda": "outro",
    "end": "outro",
    "interlude": "interlude",
    "instrumental": "interlude",
    "solo": "interlude",
    "break": "other",
    "transition": "other",
    "other": "other",
}

# Canonical (sharp-spelled) note names accepted as roots, basses and tonics.
_VALID_NOTES: frozenset[str] = frozenset(
    {"C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"}
)

# Flat spellings and their sharp-canonical (or natural) equivalents.
_FLAT_TO_SHARP: dict[str, str] = {
    "Cb": "B", "Db": "C#", "Eb": "D#", "Fb": "E",
    "Gb": "F#", "Ab": "G#", "Bb": "A#",
}

# Time signatures the converter accepts verbatim from the 'metre' header.
_VALID_TIMES: frozenset[str] = frozenset({"4/4", "3/4", "6/8", "2/4", "12/8"})

# Quality families used for mode inference.
# Qualities in neither set (e.g. "7") do not vote for either mode.
_MAJOR_QUALITIES: frozenset[str] = frozenset(
    {"maj", "maj7", "6", "add9", "aug", "sus2", "sus4", "7sus4", "aug7"}
)
_MINOR_QUALITIES: frozenset[str] = frozenset(
    {"m", "m7", "mM7", "m6", "m7b5", "dim", "dim7"}
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal data structures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class _ChordEvent:
    """One timed chord annotation belonging to a section."""

    start: float  # timestamp of the data line that introduced the chord
    duration: float  # seconds
    harte: str  # Harte chord string: 'N', 'X', 'C:maj', etc.
|
||||
|
||||
|
||||
@dataclass
class _Section:
    """A labelled song section and the chord events collected for it."""

    letter: str  # section letter, e.g. 'A', 'B'
    function: str  # our function token, e.g. 'verse', 'chorus'
    # Chord events in file order; each instance gets its own fresh list.
    events: list[_ChordEvent] = field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Note / chord helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _normalize_note(raw: str) -> Optional[str]:
    """Map a note name to its sharp-canonical spelling.

    Flat spellings listed in ``_FLAT_TO_SHARP`` are translated first; the
    result is accepted only if it is one of ``_VALID_NOTES``.

    Returns:
        The canonical note name, or None when *raw* is not recognized.
    """
    candidate = _FLAT_TO_SHARP[raw] if raw in _FLAT_TO_SHARP else raw
    if candidate not in _VALID_NOTES:
        return None
    return candidate
|
||||
|
||||
|
||||
def _bass_degree_to_note(root: str, degree: str) -> Optional[str]:
    """Resolve a Harte bass interval (e.g. '3', 'b7', '#5') against *root*.

    Args:
        root: Sharp-canonical root note name (e.g. 'C', 'A#').
        degree: Interval string: optional 'b'/'#' modifiers followed by a
            scale-degree number counted on the major scale of the root.

    Returns:
        The sharp-canonical note name of the bass, or None if *degree* is
        not a well-formed interval or *root* is not a canonical note.
    """
    # Local tables keep this helper independent of module-level state.
    chromatic = ("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B")
    major_scale = (0, 2, 4, 5, 7, 9, 11)  # semitones above the root, degrees 1-7

    m = re.fullmatch(r"([b#]*)([1-9][0-9]?)", degree)
    if m is None or root not in chromatic:
        return None
    accidentals, number = m.group(1), int(m.group(2))
    # Degrees above 7 (9, 11, 13, ...) wrap around by octaves.
    octaves, step = divmod(number - 1, 7)
    semitones = (
        major_scale[step]
        + 12 * octaves
        + accidentals.count("#")
        - accidentals.count("b")
    )
    return chromatic[(chromatic.index(root) + semitones) % 12]


def _harte_to_chord_symbol(harte: str) -> Optional[str]:
    """Convert a Harte chord string to our .chord format symbol.

    The part after the rightmost '/' is the bass.  Harte syntax defines it as
    an interval relative to the root ('C:maj/3', 'G:7/b7'); an absolute note
    name ('C:maj/E') is accepted as well.  An interval that resolves to the
    root itself is treated as root position (no slash in the output).

    Args:
        harte: Harte notation string, e.g. 'C:maj', 'Bb:min7', 'E:hdim7/G#',
            'C:maj/3'.

    Returns:
        Our chord symbol (e.g. 'Cmaj', 'A#m7', 'Em7b5/G#', 'Cmaj/E'), or None
        for N (no chord), X (unknown), or any unparseable input.
    """
    harte = harte.strip()
    if harte in ("N", "X", ""):
        return None

    # Split off the slash bass (rightmost '/'); it is resolved after the
    # root is known, because an interval bass is relative to the root.
    bass_raw: Optional[str] = None
    if "/" in harte:
        harte, bass_raw = harte.rsplit("/", 1)
        if not bass_raw:
            return None

    # Split root from quality on the first ':' (no ':' means bare root).
    root_str, _, quality_str = harte.partition(":")

    # Parse root: one letter plus an optional single accidental, nothing else.
    if not root_str or root_str[0] not in "CDEFGAB":
        return None
    if len(root_str) >= 2 and root_str[1] in "#b":
        raw_root, leftover = root_str[:2], root_str[2:]
    else:
        raw_root, leftover = root_str[:1], root_str[1:]
    if leftover:
        return None
    root = _normalize_note(raw_root)
    if root is None:
        return None

    # Resolve the bass.
    bass_note = "root"
    if bass_raw is not None:
        if bass_raw[0] in "CDEFGAB":
            # Absolute note name (uppercase letter + optional accidental).
            if len(bass_raw) >= 2 and bass_raw[1] in "#b":
                raw_b, tail = bass_raw[:2], bass_raw[2:]
            else:
                raw_b, tail = bass_raw[:1], bass_raw[1:]
            if tail:
                return None
            bn = _normalize_note(raw_b)
            if bn is None:
                return None
            bass_note = bn
        else:
            # Harte interval relative to the root ('3', 'b7', '#5', ...).
            bn = _bass_degree_to_note(root, bass_raw)
            if bn is None:
                return None
            if bn != root:
                bass_note = bn

    # Parse quality — handle parenthetical alterations like '7(b9)'
    m = re.match(r'^([^(]*)\(([^)]+)\)$', quality_str)
    if m:
        base_qual, alt = m.group(1), m.group(2)
        base_result = _HARTE_QUALITY.get(base_qual)
        if base_result is None:
            return None
        our_quality = base_result[0]
        # NOTE: the parenthetical replaces any extension implied by the base
        # shorthand (e.g. '9(#11)' keeps only '#11').
        our_ext = _HARTE_PAREN_EXT.get(alt)
        if our_ext is None:
            return None
    else:
        result = _HARTE_QUALITY.get(quality_str)
        if result is None:
            return None
        our_quality, our_ext = result

    q_ext = our_quality + ("" if our_ext == "none" else our_ext)
    bass_part = "" if bass_note == "root" else f"/{bass_note}"
    return root + q_ext + bass_part
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_salami_file(
|
||||
path: Path,
|
||||
) -> tuple[dict[str, str], list[tuple[float, str, str]]]:
|
||||
"""Parse a salami_chords.txt file.
|
||||
|
||||
Returns:
|
||||
(header, events) where header maps lowercase field names to values,
|
||||
and events is a list of (timestamp, label, chord) triples.
|
||||
label may be 'Z', a section letter (possibly with ',function'), or '.'.
|
||||
chord is in Harte notation or '' when the column is absent.
|
||||
"""
|
||||
header: dict[str, str] = {}
|
||||
events: list[tuple[float, str, str]] = []
|
||||
|
||||
for raw in path.read_text(encoding="utf-8").splitlines():
|
||||
line = raw.strip()
|
||||
if not line:
|
||||
continue
|
||||
if line.startswith("#"):
|
||||
if ":" in line:
|
||||
content = line[1:].strip()
|
||||
k, v = content.split(":", 1)
|
||||
header[k.strip().lower()] = v.strip()
|
||||
continue
|
||||
parts = line.split("\t")
|
||||
if len(parts) < 2:
|
||||
continue
|
||||
try:
|
||||
ts = float(parts[0])
|
||||
except ValueError:
|
||||
continue
|
||||
label = parts[1].strip()
|
||||
chord = parts[2].strip() if len(parts) > 2 else ""
|
||||
events.append((ts, label, chord))
|
||||
|
||||
return header, events
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Section extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_section_label(label: str) -> tuple[str, str]:
    """Split a section label such as 'A,verse' into (letter, function).

    The text after the first comma is looked up (stripped, lower-cased) in
    ``_FUNCTION_MAP``; unknown or absent functions become 'other'.
    """
    letter, comma, func_raw = label.partition(",")
    if comma:
        func = _FUNCTION_MAP.get(func_raw.strip().lower(), "other")
    else:
        func = "other"
    return letter.strip(), func
|
||||
|
||||
|
||||
def _extract_sections(
    events: list[tuple[float, str, str]],
) -> list[_Section]:
    """Group raw (timestamp, label, chord) triples into sections.

    A label of 'Z' or '' closes the current section, '.' continues it, and
    any other label opens a new one.  A chord's duration is the gap to the
    next event's timestamp (0.0 for the last event); chords with an empty
    string or a non-positive duration are not recorded.
    """
    grouped: list[_Section] = []
    open_section: Optional[_Section] = None
    total = len(events)

    for index, (ts, label, chord) in enumerate(events):
        following = index + 1
        span = events[following][0] - ts if following < total else 0.0

        if label == "Z" or label == "":
            open_section = None
            continue

        if label != ".":
            # Anything that is not a continuation opens a new section.
            letter, func = _parse_section_label(label)
            open_section = _Section(letter=letter, function=func)
            grouped.append(open_section)

        if open_section is not None and chord and span > 0:
            open_section.events.append(_ChordEvent(ts, span, chord))

    return grouped
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bar quantization
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _estimate_bar_duration(durations: list[float]) -> float:
|
||||
"""Estimate duration of one bar in seconds.
|
||||
|
||||
Uses the median of non-trivial chord durations as a proxy for one bar.
|
||||
Clamped to [1.0, 5.0] s (covers ~48–240 BPM in 4/4).
|
||||
Falls back to 2.0 s when fewer than 3 samples.
|
||||
"""
|
||||
valid = [d for d in durations if d > 0.5]
|
||||
if len(valid) < 3:
|
||||
return 2.0
|
||||
return max(1.0, min(5.0, statistics.median(valid)))
|
||||
|
||||
|
||||
def _expected_positions(time: str, subdivision: int) -> int:
|
||||
"""Number of positions per bar for the given time signature and subdivision."""
|
||||
num, denom = (int(x) for x in time.split("/"))
|
||||
return (num * subdivision) // denom
|
||||
|
||||
|
||||
def _section_to_bars(
    section: _Section,
    bar_duration: float,
    time: str,
    subdivision: int,
) -> Optional[list[list[str]]]:
    """Render a section's chord events as a list of bars.

    Every event starts a new bar whose first position holds the chord symbol
    ('NC' for Harte 'N', '?' for 'X') and whose remaining positions are '.'.
    An event lasting roughly several bars is followed by bars made entirely
    of '.' so the chord is held.

    Returns:
        The bars, or None as soon as an event carries a Harte chord that
        cannot be converted; the caller skips the section in that case.
    """
    width = _expected_positions(time, subdivision)
    placeholders = {"N": "NC", "X": "?"}
    rendered: list[list[str]] = []

    for ev in section.events:
        head = placeholders.get(ev.harte)
        if head is None:
            head = _harte_to_chord_symbol(ev.harte)
            if head is None:
                log.debug(
                    "unrecognized Harte chord %r in section %s",
                    ev.harte, section.letter,
                )
                return None

        # At least one bar per event, otherwise the nearest whole bar count.
        count = max(1, round(ev.duration / bar_duration))
        rendered.append([head] + ["."] * (width - 1))
        # Hold the chord across any additional bars.
        rendered.extend(["."] * width for _ in range(count - 1))

    return rendered
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mode inference
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _infer_mode(tonic: str, sections: list[_Section]) -> str:
    """Decide between 'major' and 'minor' from the chords built on the tonic.

    Every chord event whose root normalizes to *tonic* casts one vote
    according to whether its mapped quality is in ``_MAJOR_QUALITIES`` or
    ``_MINOR_QUALITIES``; other qualities do not vote.

    Returns:
        'minor' only when minor votes strictly outnumber major votes;
        'major' otherwise (including ties and no votes).
    """
    votes = {"major": 0, "minor": 0}
    parenthetical = re.compile(r'\([^)]*\)')

    for harte in (ev.harte for sec in sections for ev in sec.events):
        if not harte or harte in ("N", "X"):
            continue

        # Lightweight split: text before the first ':' is the root side,
        # text after it is the quality side; drop any '/bass' from both.
        root_side, _, quality_side = harte.partition(":")
        root_side = root_side.split("/")[0]
        has_accidental = len(root_side) >= 2 and root_side[1] in "#b"
        raw_root = root_side[:2] if has_accidental else root_side[:1]
        if not raw_root:
            continue
        if _normalize_note(raw_root) != tonic:
            continue

        base = parenthetical.sub("", quality_side.split("/")[0]).strip()
        mapped = _HARTE_QUALITY.get(base)
        if mapped is None:
            continue

        family = mapped[0]
        if family in _MAJOR_QUALITIES:
            votes["major"] += 1
        elif family in _MINOR_QUALITIES:
            votes["minor"] += 1

    return "minor" if votes["minor"] > votes["major"] else "major"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Metre parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_metre(metre: str) -> tuple[Optional[str], int]:
    """Translate a metre header value into (time_signature, subdivision).

    A value already in ``_VALID_TIMES`` is returned unchanged with
    subdivision 8 for '6/8' and '12/8' and 4 otherwise.  A bare integer 4, 3
    or 2 is read as that many quarter-note beats.  Anything else yields
    (None, 0).
    """
    text = metre.strip()

    if text in _VALID_TIMES:
        compound = text in ("6/8", "12/8")
        return text, (8 if compound else 4)

    try:
        beats = int(text)
    except ValueError:
        return None, 0

    by_beats = {4: "4/4", 3: "3/4", 2: "2/4"}
    signature = by_beats.get(beats)
    if signature is None:
        return None, 0
    return signature, 4
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File writing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _write_chord_file(
|
||||
path: Path,
|
||||
title: str,
|
||||
key: str,
|
||||
time: str,
|
||||
subdivision: int,
|
||||
function: Optional[str],
|
||||
bars: list[list[str]],
|
||||
) -> None:
|
||||
"""Write a harmonic period to a .chord file."""
|
||||
lines = [
|
||||
f"# title: {title}",
|
||||
f"# key: {key}",
|
||||
f"# time: {time}",
|
||||
f"# subdivision: {subdivision}",
|
||||
"# style: other",
|
||||
]
|
||||
if function:
|
||||
lines.append(f"# function: {function}")
|
||||
lines.append("") # blank line before body
|
||||
|
||||
for i in range(0, len(bars), 4):
|
||||
chunk = bars[i : i + 4]
|
||||
line = " ".join(f"| {' '.join(b)}" for b in chunk) + " |"
|
||||
lines.append(line)
|
||||
|
||||
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def convert_song(song_dir: Path, output_dir: Path) -> int:
    """Convert one McGill Billboard song directory to .chord files.

    Each section that renders to between 4 and 16 bars is written to its own
    file named ``mcgill_<song id>_<section index>_<function>.chord``.
    Sections that are too short, too long, or contain an unconvertible chord
    are skipped and counted in the per-song log line.

    Args:
        song_dir: Directory containing salami_chords.txt (e.g. 0003/).
        output_dir: Destination directory for .chord files (created if absent).

    Returns:
        Number of .chord files successfully written.  0 is also returned when
        the input file is missing or unreadable, the metre is unsupported, or
        no sections are found.
    """
    salami = song_dir / "salami_chords.txt"
    if not salami.exists():
        log.warning("no salami_chords.txt in %s, skipping", song_dir)
        return 0

    try:
        header, raw_events = _parse_salami_file(salami)
    except Exception as exc:
        # Batch boundary: one unreadable file must not abort a dataset run.
        log.error("failed to parse %s: %s", salami, exc)
        return 0

    song_id = song_dir.name

    time_sig, subdivision = _parse_metre(header.get("metre", "4/4"))
    if time_sig is None:
        log.warning(
            "unsupported metre %r in %s, skipping", header.get("metre"), song_dir
        )
        return 0

    # The fallback to C is kept, but it is no longer silent: a wrong tonic
    # would otherwise mislabel the key of every file written for this song.
    tonic_raw = header.get("tonic")
    tonic = _normalize_note(tonic_raw.strip()) if tonic_raw is not None else None
    if tonic is None:
        log.warning(
            "missing or unrecognized tonic %r in %s, assuming C",
            tonic_raw, song_dir,
        )
        tonic = "C"

    sections = _extract_sections(raw_events)
    if not sections:
        log.warning("no sections found in %s", salami)
        return 0

    # Only real chords of non-trivial length inform the bar-length estimate.
    all_durations = [
        e.duration
        for s in sections
        for e in s.events
        if e.harte not in ("N", "X", "") and e.duration > 0.5
    ]
    bar_duration = _estimate_bar_duration(all_durations)
    mode = _infer_mode(tonic, sections)
    key = f"{tonic}_{mode}"

    artist = header.get("artist", "unknown")
    song_title = header.get("title", "unknown")

    output_dir.mkdir(parents=True, exist_ok=True)
    n_saved = 0
    skip_reasons: Counter[str] = Counter()

    for idx, section in enumerate(sections):
        bars = _section_to_bars(section, bar_duration, time_sig, subdivision)
        if bars is None:
            skip_reasons["unrecognized_chord"] += 1
            continue

        n = len(bars)
        if n < 4:
            log.debug(
                "section %s in %s: %d bar(s) < 4, skipping",
                section.letter, song_id, n,
            )
            skip_reasons["too_short"] += 1
            continue
        if n > 16:
            log.debug(
                "section %s in %s: %d bars > 16, skipping",
                section.letter, song_id, n,
            )
            skip_reasons["too_long"] += 1
            continue

        func = section.function
        # idx is the position among all sections, so names stay unique even
        # when several sections share a function.
        filename = f"mcgill_{song_id}_{idx:02d}_{func}.chord"
        out_path = output_dir / filename
        period_title = f"{artist} - {song_title} ({section.letter},{func})"
        _write_chord_file(
            out_path, period_title, key, time_sig, subdivision,
            func if func != "unspecified" else None, bars,
        )
        n_saved += 1
        log.debug("wrote %s", out_path.name)

    if skip_reasons:
        log.info(
            "song %s: saved=%d skipped=%s", song_id, n_saved, dict(skip_reasons)
        )
    else:
        log.info("song %s: saved %d period(s)", song_id, n_saved)

    return n_saved
|
||||
|
||||
|
||||
def convert_dataset(dataset_dir: Path, output_dir: Path) -> tuple[int, int]:
    """Run ``convert_song`` over every subdirectory of a dataset.

    Subdirectories are processed in sorted order and all output goes to the
    same *output_dir*, which is created if needed.

    Args:
        dataset_dir: Root directory containing per-song subdirectories.
        output_dir: Destination directory for .chord files.

    Returns:
        (n_saved, n_empty): total files written, and the number of song
        directories for which nothing was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    song_dirs = sorted(entry for entry in dataset_dir.iterdir() if entry.is_dir())
    per_song = [convert_song(entry, output_dir) for entry in song_dirs]

    n_saved = sum(per_song)
    n_empty = per_song.count(0)

    log.info(
        "conversion complete: %d periods saved, %d songs produced no output",
        n_saved, n_empty,
    )
    return n_saved, n_empty
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Command-line entry point: parse arguments, configure logging, convert the
# whole dataset and print a short summary.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert McGill Billboard dataset to .chord files.",
        epilog=(
            "Example:\n"
            "  python -m src.external_converters.mcgill_to_chord "
            "data/raw_external/mcgill/ --out data/raw_external/mcgill_converted/"
        ),
        # Raw formatter keeps the newline inside the epilog intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "dataset_dir", type=Path, metavar="dataset_dir",
        help="directory containing per-song subdirectories (0003/, 0004/, ...)",
    )
    parser.add_argument(
        "--out", type=Path,
        default=Path("data/raw_external/mcgill_converted"),
        metavar="output_dir",
        help="destination for .chord files (default: data/raw_external/mcgill_converted/)",
    )
    parser.add_argument(
        "--log-level", default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="logging verbosity (default: INFO)",
    )
    args = parser.parse_args()

    # The choices above are all attribute names on the logging module.
    logging.basicConfig(level=getattr(logging, args.log_level), format="%(message)s")
    n_saved, n_empty = convert_dataset(args.dataset_dir, args.out)
    print(f"Saved {n_saved} periods. {n_empty} song(s) produced no output.")
    print(f"Output: {args.out}")
|
||||
@@ -0,0 +1,15 @@
|
||||
# artist: Test Artist
|
||||
# title: Test Song
|
||||
# metre: 4/4
|
||||
# tonic: C
|
||||
|
||||
0.000000 Z
|
||||
4.000000 A,verse C:maj
|
||||
8.000000 . F:maj
|
||||
12.000000 . G:7
|
||||
16.000000 . C:maj
|
||||
20.000000 B,chorus F:maj
|
||||
24.000000 . C:maj
|
||||
28.000000 . G:7
|
||||
32.000000 . C:maj
|
||||
36.000000 Z
|
||||
@@ -0,0 +1,265 @@
|
||||
"""Tests for src/external_converters/mcgill_to_chord.py.
|
||||
|
||||
Fixture: tests/fixtures/mcgill_test/0001/salami_chords.txt
|
||||
4/4 song in C major, two sections:
|
||||
Section A (verse): C:maj F:maj G:7 C:maj — 4 chords × 4.0 s each
|
||||
Section B (chorus): F:maj C:maj G:7 C:maj — 4 chords × 4.0 s each
|
||||
|
||||
Expected output: 2 .chord files, each with 4 bars, key=C_major, time=4/4.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from src.external_converters.mcgill_to_chord import (
|
||||
_estimate_bar_duration,
|
||||
_extract_sections,
|
||||
_harte_to_chord_symbol,
|
||||
_infer_mode,
|
||||
_parse_metre,
|
||||
_parse_salami_file,
|
||||
_section_to_bars,
|
||||
convert_song,
|
||||
)
|
||||
from src.tokenizer import parse_chord_file
|
||||
|
||||
FIXTURES = Path(__file__).parent / "fixtures" / "mcgill_test"
|
||||
TEST_SONG = FIXTURES / "0001"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Harte chord symbol conversion
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHarteConversion:
    """Unit tests for individual Harte → .chord symbol conversion."""

    def test_simple_major(self):
        assert _harte_to_chord_symbol("C:maj") == "Cmaj"

    def test_flat_minor_seventh(self):
        # Bb normalises to A#
        assert _harte_to_chord_symbol("Bb:min7") == "A#m7"

    def test_half_diminished(self):
        # hdim7 = half-diminished 7th = our m7b5
        assert _harte_to_chord_symbol("E:hdim7") == "Em7b5"

    def test_dominant_seventh(self):
        assert _harte_to_chord_symbol("G:7") == "G7"

    def test_major_seventh(self):
        assert _harte_to_chord_symbol("D:maj7") == "Dmaj7"

    def test_minor(self):
        assert _harte_to_chord_symbol("A:min") == "Am"

    def test_diminished_seventh(self):
        assert _harte_to_chord_symbol("B:dim7") == "Bdim7"

    def test_augmented(self):
        assert _harte_to_chord_symbol("C:aug") == "Caug"

    def test_slash_chord(self):
        # Bass given as an absolute note name
        assert _harte_to_chord_symbol("C:maj/E") == "Cmaj/E"

    def test_slash_chord_flat_bass(self):
        # Flat bass note also normalised to sharp
        assert _harte_to_chord_symbol("G:maj/Bb") == "Gmaj/A#"

    def test_no_chord_returns_none(self):
        assert _harte_to_chord_symbol("N") is None

    def test_unknown_returns_none(self):
        assert _harte_to_chord_symbol("X") is None

    def test_empty_returns_none(self):
        assert _harte_to_chord_symbol("") is None

    def test_extended_dominant_ninth(self):
        # G:9 → dominant 7 + extension 9
        assert _harte_to_chord_symbol("G:9") == "G79"

    def test_major_ninth(self):
        # maj9 → maj7 + extension 9
        assert _harte_to_chord_symbol("C:maj9") == "Cmaj79"

    def test_parenthetical_flat_nine(self):
        assert _harte_to_chord_symbol("C:7(b9)") == "C7b9"

    def test_parenthetical_sharp_eleven(self):
        assert _harte_to_chord_symbol("F:maj7(#11)") == "Fmaj7#11"

    def test_sharp_root(self):
        assert _harte_to_chord_symbol("F#:min7") == "F#m7"

    def test_output_is_parseable(self):
        # Round-trip check: converted symbols must be accepted by the
        # project's own chord parser.
        from src.chord_parser import parse_chord_symbol
        for harte in ("C:maj", "Bb:min7", "E:hdim7", "G:7", "D:maj7", "C:maj/E"):
            sym = _harte_to_chord_symbol(harte)
            assert sym is not None
            parse_chord_symbol(sym)  # must not raise
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper units
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestParseSalamiFile:
    """Tests for reading the fixture's header and raw event lines."""

    def test_header_parsed(self):
        # Header keys are lower-cased; values are stripped.
        header, _ = _parse_salami_file(TEST_SONG / "salami_chords.txt")
        assert header["artist"] == "Test Artist"
        assert header["title"] == "Test Song"
        assert header["metre"] == "4/4"
        assert header["tonic"] == "C"

    def test_events_count(self):
        _, events = _parse_salami_file(TEST_SONG / "salami_chords.txt")
        # 10 data lines total (including Z lines)
        assert len(events) == 10

    def test_first_event_is_silence(self):
        _, events = _parse_salami_file(TEST_SONG / "salami_chords.txt")
        ts, label, chord = events[0]
        assert ts == 0.0
        assert label == "Z"
|
||||
|
||||
|
||||
class TestExtractSections:
    """Tests for grouping the fixture's events into sections."""

    def test_two_sections(self):
        # The 'Z' boundary lines do not create sections of their own.
        _, events = _parse_salami_file(TEST_SONG / "salami_chords.txt")
        sections = _extract_sections(events)
        assert len(sections) == 2

    def test_section_functions(self):
        _, events = _parse_salami_file(TEST_SONG / "salami_chords.txt")
        sections = _extract_sections(events)
        assert sections[0].function == "verse"
        assert sections[1].function == "chorus"

    def test_events_per_section(self):
        _, events = _parse_salami_file(TEST_SONG / "salami_chords.txt")
        sections = _extract_sections(events)
        assert len(sections[0].events) == 4
        assert len(sections[1].events) == 4

    def test_chord_values(self):
        # Chords are stored in their original Harte spelling.
        _, events = _parse_salami_file(TEST_SONG / "salami_chords.txt")
        sections = _extract_sections(events)
        hartes = [e.harte for e in sections[0].events]
        assert hartes == ["C:maj", "F:maj", "G:7", "C:maj"]
|
||||
|
||||
|
||||
class TestEstimateBarDuration:
    """Tests for the median-based bar duration estimate."""

    def test_uniform_durations(self):
        assert _estimate_bar_duration([2.0, 2.0, 2.0, 2.0]) == 2.0

    def test_mixed_durations(self):
        # Median of [2, 2, 2, 4, 4] = 2 → bar_dur = 2
        assert _estimate_bar_duration([2.0, 2.0, 2.0, 4.0, 4.0]) == 2.0

    def test_too_few_samples_returns_default(self):
        assert _estimate_bar_duration([]) == 2.0
        assert _estimate_bar_duration([3.0]) == 2.0

    def test_clamp_upper(self):
        assert _estimate_bar_duration([10.0, 10.0, 10.0]) == 5.0

    def test_clamp_lower(self):
        # Despite the name, this exercises the fallback rather than the lower
        # clamp: every sample is filtered out (not > 0.5 s), leaving fewer
        # than 3 samples, so the 2.0 s default is returned.
        assert _estimate_bar_duration([0.3, 0.3, 0.3]) == 2.0
|
||||
|
||||
|
||||
class TestParseMetre:
    """Tests for metre header parsing into (time signature, subdivision)."""

    def test_4_4(self):
        assert _parse_metre("4/4") == ("4/4", 4)

    def test_3_4(self):
        assert _parse_metre("3/4") == ("3/4", 4)

    def test_6_8(self):
        # Eighth-note metres use subdivision 8
        assert _parse_metre("6/8") == ("6/8", 8)

    def test_integer_4(self):
        # A bare beat count is read as quarter-note beats
        assert _parse_metre("4") == ("4/4", 4)

    def test_unsupported(self):
        sig, sub = _parse_metre("7/8")
        assert sig is None
        assert sub == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Full period conversion
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFullConversion:
    """Integration tests: convert_song with fixture produces valid .chord files."""

    def test_returns_two_periods(self, tmp_path):
        # One file per fixture section (verse and chorus)
        assert convert_song(TEST_SONG, tmp_path) == 2

    def test_output_files_exist(self, tmp_path):
        convert_song(TEST_SONG, tmp_path)
        assert len(list(tmp_path.glob("*.chord"))) == 2

    def test_output_files_are_parseable(self, tmp_path):
        convert_song(TEST_SONG, tmp_path)
        for f in tmp_path.glob("*.chord"):
            assert parse_chord_file(f) is not None  # must not raise

    def test_verse_has_four_bars(self, tmp_path):
        convert_song(TEST_SONG, tmp_path)
        # The function token is part of the output filename
        verse_files = sorted(tmp_path.glob("*verse*.chord"))
        assert len(verse_files) == 1
        assert len(parse_chord_file(verse_files[0]).bars) == 4

    def test_chorus_has_four_bars(self, tmp_path):
        convert_song(TEST_SONG, tmp_path)
        chorus_files = sorted(tmp_path.glob("*chorus*.chord"))
        assert len(chorus_files) == 1
        assert len(parse_chord_file(chorus_files[0]).bars) == 4

    def test_header_time_and_subdivision(self, tmp_path):
        convert_song(TEST_SONG, tmp_path)
        for f in tmp_path.glob("*.chord"):
            p = parse_chord_file(f)
            assert p.time == "4/4"
            assert p.subdivision == 4

    def test_style_is_other(self, tmp_path):
        convert_song(TEST_SONG, tmp_path)
        for f in tmp_path.glob("*.chord"):
            assert parse_chord_file(f).style == "other"

    def test_key_is_c_major(self, tmp_path):
        # Tonic C from the header plus inferred mode 'major'
        convert_song(TEST_SONG, tmp_path)
        for f in tmp_path.glob("*.chord"):
            assert parse_chord_file(f).key == "C_major"

    def test_function_tags(self, tmp_path):
        convert_song(TEST_SONG, tmp_path)
        funcs = {parse_chord_file(f).function for f in tmp_path.glob("*.chord")}
        assert funcs == {"verse", "chorus"}

    def test_filenames_contain_song_id(self, tmp_path):
        # The song id is the fixture directory name ('0001')
        convert_song(TEST_SONG, tmp_path)
        names = {f.name for f in tmp_path.glob("*.chord")}
        assert all("0001" in name for name in names)

    def test_bar_positions_are_valid_chords(self, tmp_path):
        from src.chord_parser import parse_chord_symbol
        convert_song(TEST_SONG, tmp_path)
        for f in tmp_path.glob("*.chord"):
            p = parse_chord_file(f)
            for bar in p.bars:
                first = bar[0]
                # Hold, no-chord and unknown markers are not chord symbols
                if first not in (".", "NC", "?"):
                    parse_chord_symbol(first)  # must not raise

    def test_missing_salami_returns_zero(self, tmp_path):
        # A song directory without salami_chords.txt produces no output
        empty_song = tmp_path / "empty"
        empty_song.mkdir()
        assert convert_song(empty_song, tmp_path / "out") == 0
|
||||
Reference in New Issue
Block a user