feat: add dataset, prepare_data pipeline and fix McGill converter

- src/dataset.py: ChordDataset wrapping .pt files with pad/truncate
- scripts/prepare_data.py: tokenize .chord to .pt with train/val/holdout
  split, logs token length stats and style/function distributions
- src/external_converters/mcgill_to_chord.py: rewrite parser for real
  McGill v2 format (2-column annotation, each bar in its own pipe group,
  interval bass notation e.g. /5 and /b3)
- .gitignore: exclude data/processed/train, val, holdout subdirectories
- tests: 37 new tests for ChordDataset and converter (260 total, all pass)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-19 18:09:46 +03:00
parent ea32bf43b2
commit 84ba7b4743
7 changed files with 876 additions and 314 deletions
+52
View File
@@ -0,0 +1,52 @@
"""PyTorch Dataset for tokenized .chord period files.
Public API:
ChordDataset — Dataset that loads pre-tokenized .pt files from a directory.
"""
from __future__ import annotations
import logging
from pathlib import Path
import torch
from torch.utils.data import Dataset
from src.tokenizer import TOKEN_TO_ID
log = logging.getLogger(__name__)
_PAD_ID: int = TOKEN_TO_ID["<PAD>"]
class ChordDataset(Dataset):
    """Dataset over a directory of tokenized .pt period files.

    Each .pt file must be a dict ``{"tokens": LongTensor, "meta": dict}``.
    ``__getitem__`` returns a fixed-length tensor: the token sequence is
    truncated to *max_length* if too long, or right-padded with <PAD> if short.

    Args:
        data_dir: Directory containing .pt files (non-recursive).
        max_length: Fixed output sequence length. Default 512.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """

    def __init__(self, data_dir: Path, max_length: int = 512) -> None:
        # A non-positive length would make the truncation slice return empty
        # or end-trimmed tensors of varying size, so reject it up front.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        self._max_length = max_length
        # Sorted so that index -> file mapping is stable across runs.
        self._files: list[Path] = sorted(Path(data_dir).glob("*.pt"))
        if not self._files:
            log.warning("ChordDataset: no .pt files found in %s", data_dir)

    def __len__(self) -> int:
        """Return the number of .pt files discovered in the data directory."""
        return len(self._files)

    def __getitem__(self, idx: int) -> torch.Tensor:
        """Load file *idx* and return its tokens at exactly *max_length* items.

        The file is read on every access; only the ``"tokens"`` entry is used.
        The tensor is expected to be 1-D, because padding concatenates a 1-D
        tensor of <PAD> ids along dimension 0.
        """
        data = torch.load(self._files[idx], weights_only=True)
        tokens: torch.Tensor = data["tokens"]
        length = tokens.shape[0]
        if length >= self._max_length:
            return tokens[: self._max_length]
        # Pad with the same dtype as the stored tokens so torch.cat succeeds.
        pad = torch.full((self._max_length - length,), _PAD_ID, dtype=tokens.dtype)
        return torch.cat([tokens, pad])
+260 -238
View File
@@ -1,19 +1,24 @@
"""Convert McGill Billboard dataset (salami_chords.txt) to .chord files.
McGill Billboard format:
McGill Billboard v2 format:
Each song is a subdirectory (e.g. 0003/, 0004/) containing salami_chords.txt.
The file has a header (# key: value) followed by tab-separated data lines:
<timestamp>\\t<section_label>\\t<chord>
Header: # key: value lines (artist, title, metre, tonic).
Data: tab-separated pairs <timestamp>\\t<annotation> where annotation is:
- "silence" / "end" — structural boundary (no chord data)
- "[Letter[, function,]] | bar1 | bar2 | ... |"
Each | ... | group is ONE BAR; space-separated tokens inside are
beat-level chord changes within that bar.
- "| ... | xN" — the bar(s) repeated N times
Section labels: 'Z' (silence/boundary), a letter (e.g. 'A', 'B,verse'), or '.' (continuation).
Chords: Harte notation (e.g. C:maj, Bb:min7, N for no chord, X for unknown).
Bass notes in Harte may be absolute (e.g. '/E') or scale-degree intervals
(e.g. '/5' = perfect fifth, '/b3' = minor third above root).
Public API:
convert_dataset(dataset_dir, output_dir) -- convert entire dataset directory
convert_dataset(dataset_dir, output_dir) -- convert entire dataset
convert_song(song_dir, output_dir) -- convert one song directory
CLI:
python -m src.external_converters.mcgill_to_chord <dataset_dir> [--out <output_dir>]
python -m src.external_converters.mcgill_to_chord <dataset_dir> [--out <output_dir>]
Example:
python -m src.external_converters.mcgill_to_chord data/raw_external/mcgill/ \\
@@ -25,14 +30,35 @@ from __future__ import annotations
import argparse
import logging
import re
import statistics
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Note tables
# ---------------------------------------------------------------------------
_CHROMATIC: list[str] = [
"C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"
]
_NOTE_INDEX: dict[str, int] = {n: i for i, n in enumerate(_CHROMATIC)}
_FLAT_TO_SHARP: dict[str, str] = {
"Cb": "B", "Db": "C#", "Eb": "D#", "Fb": "E",
"Gb": "F#", "Ab": "G#", "Bb": "A#",
}
_VALID_NOTES: frozenset[str] = frozenset(_CHROMATIC)
# Harte scale-degree intervals: semitones above root
_HARTE_INTERVAL: dict[str, int] = {
"1": 0, "b2": 1, "2": 2, "b3": 3, "3": 4, "4": 5,
"#4": 6, "b5": 6, "5": 7, "#5": 8, "b6": 8, "6": 9,
"b7": 10, "7": 11,
}
# ---------------------------------------------------------------------------
# Harte quality → (our_quality, our_extension)
# ---------------------------------------------------------------------------
@@ -63,12 +89,11 @@ _HARTE_QUALITY: dict[str, tuple[str, str]] = {
"13": ("7", "13"),
"maj13": ("maj7", "13"),
"min13": ("m7", "13"),
"1": ("maj", "none"), # root only → major
"5": ("maj", "none"), # power chord → major (no 3rd)
"": ("maj", "none"), # bare root
"1": ("maj", "none"),
"5": ("maj", "none"),
"": ("maj", "none"),
}
# Parenthetical alterations in Harte (e.g. '7(b9)') → our extension token
_HARTE_PAREN_EXT: dict[str, str] = {
"b9": "b9",
"#9": "#9",
@@ -79,7 +104,6 @@ _HARTE_PAREN_EXT: dict[str, str] = {
"9": "9",
}
# McGill Billboard section function strings → our function tokens
_FUNCTION_MAP: dict[str, str] = {
"intro": "intro",
"verse": "verse",
@@ -92,7 +116,7 @@ _FUNCTION_MAP: dict[str, str] = {
"bridge": "bridge",
"outro": "outro",
"coda": "outro",
"end": "outro",
"ending": "outro",
"interlude": "interlude",
"instrumental": "interlude",
"solo": "interlude",
@@ -101,18 +125,8 @@ _FUNCTION_MAP: dict[str, str] = {
"other": "other",
}
_VALID_NOTES: frozenset[str] = frozenset(
{"C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"}
)
_FLAT_TO_SHARP: dict[str, str] = {
"Cb": "B", "Db": "C#", "Eb": "D#", "Fb": "E",
"Gb": "F#", "Ab": "G#", "Bb": "A#",
}
_VALID_TIMES: frozenset[str] = frozenset({"4/4", "3/4", "6/8", "2/4", "12/8"})
# Quality families used for mode inference
_MAJOR_QUALITIES: frozenset[str] = frozenset(
{"maj", "maj7", "6", "add9", "aug", "sus2", "sus4", "7sus4", "aug7"}
)
@@ -120,25 +134,6 @@ _MINOR_QUALITIES: frozenset[str] = frozenset(
{"m", "m7", "mM7", "m6", "m7b5", "dim", "dim7"}
)
# ---------------------------------------------------------------------------
# Internal data structures
# ---------------------------------------------------------------------------
@dataclass
class _ChordEvent:
start: float
duration: float # seconds
harte: str # Harte chord string: 'N', 'X', 'C:maj', etc.
@dataclass
class _Section:
letter: str # section letter, e.g. 'A', 'B'
function: str # our function token, e.g. 'verse', 'chorus'
events: list[_ChordEvent] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Note / chord helpers
# ---------------------------------------------------------------------------
@@ -150,35 +145,49 @@ def _normalize_note(raw: str) -> Optional[str]:
return note if note in _VALID_NOTES else None
def _resolve_harte_bass(root: str, bass_str: str) -> Optional[str]:
    """Resolve the bass part of a Harte slash chord to an absolute note name.

    Two spellings are accepted: an absolute note ('E', 'Bb') or a
    scale-degree interval relative to the chord root ('5', 'b3').

    Args:
        root: Chord root; it is looked up directly in the chromatic index,
            so it must already be one of the sharp-spelled note names.
        bass_str: Text that followed the '/' in the Harte chord.

    Returns:
        The bass note name, or None when *bass_str* is empty, has trailing
        characters after an absolute note, or is an unknown interval.
    """
    token = bass_str.strip()
    if not token:
        return None

    if token[0] in "ABCDEFG":
        # Absolute note: a letter A-G followed by at most one accidental.
        has_accidental = len(token) >= 2 and token[1] in "#b"
        split_at = 2 if has_accidental else 1
        if token[split_at:]:
            # Anything after the note name makes the bass unparseable.
            return None
        return _normalize_note(token[:split_at])

    # Otherwise treat it as a scale-degree interval above the root.
    semitones = _HARTE_INTERVAL.get(token)
    if semitones is None:
        return None
    return _CHROMATIC[(_NOTE_INDEX[root] + semitones) % 12]
def _harte_to_chord_symbol(harte: str) -> Optional[str]:
"""Convert a Harte chord string to our .chord format symbol.
"""Convert a Harte chord string to our .chord symbol.
Args:
harte: Harte notation string, e.g. 'C:maj', 'Bb:min7', 'E:hdim7/G#'.
harte: Harte notation, e.g. 'C:maj', 'Bb:min7', 'F:maj/5', 'G:7(b9)'.
Returns:
Our chord symbol (e.g. 'Cmaj', 'A#m7', 'Em7b5/G#'), or None for
Our chord symbol (e.g. 'Cmaj', 'A#m7', 'Fmaj/C'), or None for
N (no chord), X (unknown), or any unparseable input.
"""
harte = harte.strip()
if harte in ("N", "X", ""):
return None
# Extract slash bass note (rightmost '/')
bass_note = "root"
# Extract slash bass (rightmost '/')
bass_raw: Optional[str] = None
if "/" in harte:
main, bass_raw = harte.rsplit("/", 1)
if len(bass_raw) >= 2 and bass_raw[1] in "#b":
raw_b, tail = bass_raw[:2], bass_raw[2:]
else:
raw_b, tail = bass_raw[:1], bass_raw[1:]
if tail or not raw_b:
return None
bn = _normalize_note(raw_b)
if bn is None:
return None
bass_note = bn
harte = main
harte, bass_raw = harte.rsplit("/", 1)
# Split root from quality on first ':'
if ":" in harte:
@@ -202,6 +211,14 @@ def _harte_to_chord_symbol(harte: str) -> Optional[str]:
if root is None:
return None
# Resolve bass now that root is known
bass_note = "root"
if bass_raw is not None:
resolved = _resolve_harte_bass(root, bass_raw)
if resolved is None:
return None
bass_note = resolved
# Parse quality — handle parenthetical alterations like '7(b9)'
m = re.match(r'^([^(]*)\(([^)]+)\)$', quality_str)
if m:
@@ -231,17 +248,15 @@ def _harte_to_chord_symbol(harte: str) -> Optional[str]:
def _parse_salami_file(
path: Path,
) -> tuple[dict[str, str], list[tuple[float, str, str]]]:
) -> tuple[dict[str, str], list[tuple[float, str]]]:
"""Parse a salami_chords.txt file.
Returns:
(header, events) where header maps lowercase field names to values,
and events is a list of (timestamp, label, chord) triples.
label may be 'Z', a section letter (possibly with ',function'), or '.'.
chord is in Harte notation or '' when the column is absent.
(header, data_lines) where header maps lowercase field names to values
and data_lines is a list of (timestamp, annotation_string) pairs.
"""
header: dict[str, str] = {}
events: list[tuple[float, str, str]] = []
data_lines: list[tuple[float, str]] = []
for raw in path.read_text(encoding="utf-8").splitlines():
line = raw.strip()
@@ -253,126 +268,118 @@ def _parse_salami_file(
k, v = content.split(":", 1)
header[k.strip().lower()] = v.strip()
continue
parts = line.split("\t")
parts = line.split("\t", 1)
if len(parts) < 2:
continue
try:
ts = float(parts[0])
except ValueError:
continue
label = parts[1].strip()
chord = parts[2].strip() if len(parts) > 2 else ""
events.append((ts, label, chord))
data_lines.append((ts, parts[1].strip()))
return header, events
return header, data_lines
# ---------------------------------------------------------------------------
# Section extraction
# Annotation line parsing
# ---------------------------------------------------------------------------
def _parse_section_label(label: str) -> tuple[str, str]:
"""Parse 'A,verse' → (letter='A', function='verse')."""
if "," in label:
letter, func_raw = label.split(",", 1)
func = _FUNCTION_MAP.get(func_raw.strip().lower(), "other")
def _parse_annotation_line(
annotation: str,
) -> tuple[Optional[str], Optional[str], list[str]]:
"""Parse one annotation string into (section_letter, function, bar_strings).
bar_strings is a list of bar content strings, one per bar.
Returns (None, None, []) for silence/end/empty/continuation-only lines.
"""
annotation = annotation.strip()
if not annotation or annotation.lower() in ("silence", "end"):
return None, None, []
if annotation.startswith("->"):
return None, None, []
section_letter: Optional[str] = None
function: Optional[str] = None
first_pipe = annotation.find("|")
if first_pipe == -1:
prefix = annotation
bar_section = ""
else:
letter = label
func = "other"
return letter.strip(), func
prefix = annotation[:first_pipe]
bar_section = annotation[first_pipe:]
# Parse optional section header before first '|'
if prefix.strip():
parts = [p.strip() for p in prefix.rstrip(",").split(",")]
if parts and len(parts[0]) == 1 and parts[0].isupper():
section_letter = parts[0]
if len(parts) > 1 and parts[1]:
function = _FUNCTION_MAP.get(parts[1].lower(), "other")
if not bar_section:
return section_letter, function, []
# Split on '|': odd-indexed parts are bar contents, last part is trailing
raw_parts = bar_section.split("|")
# raw_parts[0] is before first '|' (empty or whitespace)
# raw_parts[-1] is after last '|' (trailing annotation / xN)
trailing = raw_parts[-1].strip() if raw_parts else ""
intermediate = raw_parts[1:-1] # bar contents between pipes
bar_strings = [p.strip() for p in intermediate if p.strip()]
# Handle xN repeat: "x4" in trailing → repeat all bars N times
xN = re.match(r"x(\d+)\b", trailing)
if xN and bar_strings:
bar_strings = bar_strings * int(xN.group(1))
return section_letter, function, bar_strings
def _extract_sections(
events: list[tuple[float, str, str]],
) -> list[_Section]:
"""Group raw event triples into _Section objects with _ChordEvent lists."""
sections: list[_Section] = []
current: Optional[_Section] = None
timestamps = [e[0] for e in events]
def _bar_str_to_positions(bar_content: str, n_positions: int) -> Optional[list[str]]:
"""Convert bar content string to a fixed-length position list.
for i, (ts, label, chord) in enumerate(events):
dur = timestamps[i + 1] - ts if i + 1 < len(timestamps) else 0.0
if label in ("Z", ""):
current = None
continue
if label == ".":
if current is not None and chord and dur > 0:
current.events.append(_ChordEvent(ts, dur, chord))
continue
# New section starts here
letter, func = _parse_section_label(label)
current = _Section(letter=letter, function=func)
sections.append(current)
if chord and dur > 0:
current.events.append(_ChordEvent(ts, dur, chord))
return sections
# ---------------------------------------------------------------------------
# Bar quantization
# ---------------------------------------------------------------------------
def _estimate_bar_duration(durations: list[float]) -> float:
"""Estimate duration of one bar in seconds.
Uses the median of non-trivial chord durations as a proxy for one bar.
Clamped to [1.0, 5.0] s (covers ~48-240 BPM in 4/4).
Falls back to 2.0 s when fewer than 3 samples.
Distributes space-separated chord elements across n_positions slots.
Returns None if any element is an unrecognized chord symbol.
"""
valid = [d for d in durations if d > 0.5]
if len(valid) < 3:
return 2.0
return max(1.0, min(5.0, statistics.median(valid)))
# Filter out performance annotations: keep only chord-like tokens
raw_elements = bar_content.split()
elements = [e for e in raw_elements if _is_chord_element(e)]
positions: list[str] = ["."] * n_positions
n = len(elements)
if n == 0:
return positions
def _expected_positions(time: str, subdivision: int) -> int:
"""Number of positions per bar for the given time signature and subdivision."""
num, denom = (int(x) for x in time.split("/"))
return (num * subdivision) // denom
def _section_to_bars(
section: _Section,
bar_duration: float,
time: str,
subdivision: int,
) -> Optional[list[list[str]]]:
"""Convert a section's chord events to a list of bars.
Returns None if any event contains an unrecognized Harte chord symbol;
the caller will skip the section and log a reason.
"""
positions_per_bar = _expected_positions(time, subdivision)
bars: list[list[str]] = []
for event in section.events:
if event.harte == "N":
first_pos = "NC"
elif event.harte == "X":
first_pos = "?"
for i, elem in enumerate(elements):
pos_idx = i * n_positions // n
if elem == ".":
continue # explicit hold — leave slot as "."
elif elem == "N":
if positions[pos_idx] == ".":
positions[pos_idx] = "NC"
elif elem == "X":
if positions[pos_idx] == ".":
positions[pos_idx] = "?"
else:
sym = _harte_to_chord_symbol(event.harte)
sym = _harte_to_chord_symbol(elem)
if sym is None:
log.debug(
"unrecognized Harte chord %r in section %s",
event.harte, section.letter,
)
log.debug("unrecognized Harte chord %r in bar %r", elem, bar_content)
return None
first_pos = sym
if positions[pos_idx] == ".":
positions[pos_idx] = sym
n_bars = max(1, round(event.duration / bar_duration))
bars.append([first_pos] + ["."] * (positions_per_bar - 1))
for _ in range(n_bars - 1):
# Hold chord across additional bars
bars.append(["."] * positions_per_bar)
return positions
return bars
def _is_chord_element(elem: str) -> bool:
"""True if elem is a chord token, hold marker, or NC/unknown."""
if elem in (".", "N", "X"):
return True
# Chord: starts with a note letter
return bool(elem) and elem[0] in "ABCDEFG"
# ---------------------------------------------------------------------------
@@ -380,45 +387,42 @@ def _section_to_bars(
# ---------------------------------------------------------------------------
def _infer_mode(tonic: str, sections: list[_Section]) -> str:
def _infer_mode(tonic: str, harte_chords: list[str]) -> str:
"""Determine 'major' or 'minor' from tonic chord quality distribution.
Counts occurrences of the tonic root in major-family vs minor-family
qualities across all sections. Returns 'major' on a tie or no data.
Returns 'major' on a tie or when no data is available.
"""
major_count = 0
minor_count = 0
for section in sections:
for event in section.events:
if not event.harte or event.harte in ("N", "X"):
continue
# Extract root without a full Harte parse
colon = event.harte.find(":")
root_part = event.harte[:colon] if colon != -1 else event.harte
root_str = root_part.split("/")[0]
if len(root_str) >= 2 and root_str[1] in "#b":
raw_root = root_str[:2]
else:
raw_root = root_str[:1]
if not raw_root:
continue
root = _normalize_note(raw_root)
if root != tonic:
continue
# Extract quality
quality_str = event.harte[colon + 1:] if colon != -1 else ""
if "/" in quality_str:
quality_str = quality_str[: quality_str.index("/")]
base = re.sub(r'\([^)]*\)', "", quality_str).strip()
result = _HARTE_QUALITY.get(base)
if result is None:
continue
our_quality = result[0]
if our_quality in _MAJOR_QUALITIES:
major_count += 1
elif our_quality in _MINOR_QUALITIES:
minor_count += 1
for harte in harte_chords:
if not harte or harte in ("N", "X", "."):
continue
colon = harte.find(":")
root_part = harte[:colon] if colon != -1 else harte
root_str = root_part.split("/")[0]
if len(root_str) >= 2 and root_str[1] in "#b":
raw_root = root_str[:2]
else:
raw_root = root_str[:1]
if not raw_root:
continue
root = _normalize_note(raw_root)
if root != tonic:
continue
quality_str = harte[colon + 1:] if colon != -1 else ""
slash_pos = quality_str.find("/")
if slash_pos != -1:
quality_str = quality_str[:slash_pos]
base = re.sub(r"\([^)]*\)", "", quality_str).strip()
result = _HARTE_QUALITY.get(base)
if result is None:
continue
our_quality = result[0]
if our_quality in _MAJOR_QUALITIES:
major_count += 1
elif our_quality in _MINOR_QUALITIES:
minor_count += 1
return "minor" if minor_count > major_count else "major"
@@ -441,6 +445,11 @@ def _parse_metre(metre: str) -> tuple[Optional[str], int]:
return None, 0
def _expected_positions(time: str, subdivision: int) -> int:
num, denom = (int(x) for x in time.split("/"))
return (num * subdivision) // denom
# ---------------------------------------------------------------------------
# File writing
# ---------------------------------------------------------------------------
@@ -455,7 +464,6 @@ def _write_chord_file(
function: Optional[str],
bars: list[list[str]],
) -> None:
"""Write a harmonic period to a .chord file."""
lines = [
f"# title: {title}",
f"# key: {key}",
@@ -463,12 +471,12 @@ def _write_chord_file(
f"# subdivision: {subdivision}",
"# style: other",
]
if function:
if function and function != "unspecified":
lines.append(f"# function: {function}")
lines.append("") # blank line before body
lines.append("")
for i in range(0, len(bars), 4):
chunk = bars[i : i + 4]
chunk = bars[i: i + 4]
line = " ".join(f"| {' '.join(b)}" for b in chunk) + " |"
lines.append(line)
@@ -484,8 +492,8 @@ def convert_song(song_dir: Path, output_dir: Path) -> int:
"""Convert one McGill Billboard song directory to .chord files.
Args:
song_dir: Directory containing salami_chords.txt (e.g. 0003/).
output_dir: Destination directory for .chord files (created if absent).
song_dir: Directory containing salami_chords.txt.
output_dir: Destination directory for .chord files.
Returns:
Number of .chord files successfully written.
@@ -496,13 +504,12 @@ def convert_song(song_dir: Path, output_dir: Path) -> int:
return 0
try:
header, raw_events = _parse_salami_file(salami)
header, data_lines = _parse_salami_file(salami)
except Exception as exc:
log.error("failed to parse %s: %s", salami, exc)
return 0
song_id = song_dir.name
time_sig, subdivision = _parse_metre(header.get("metre", "4/4"))
if time_sig is None:
log.warning(
@@ -513,57 +520,75 @@ def convert_song(song_dir: Path, output_dir: Path) -> int:
tonic_raw = header.get("tonic", "C").strip()
tonic = _normalize_note(tonic_raw) or "C"
sections = _extract_sections(raw_events)
if not sections:
log.warning("no sections found in %s", salami)
return 0
# Collect all Harte tokens for mode inference
all_harte: list[str] = []
for _, annotation in data_lines:
_, _, bar_groups = _parse_annotation_line(annotation)
for bg in bar_groups:
all_harte.extend(bg.split())
all_durations = [
e.duration
for s in sections
for e in s.events
if e.harte not in ("N", "X", "") and e.duration > 0.5
]
bar_duration = _estimate_bar_duration(all_durations)
mode = _infer_mode(tonic, sections)
mode = _infer_mode(tonic, all_harte)
key = f"{tonic}_{mode}"
artist = header.get("artist", "unknown")
song_title = header.get("title", "unknown")
n_positions = _expected_positions(time_sig, subdivision)
# Group annotation lines into sections
sections: list[tuple[str, list[list[str]]]] = []
current_function = "unspecified"
current_bars: list[list[str]] = []
current_valid = True
for _, annotation in data_lines:
letter, func, bar_groups = _parse_annotation_line(annotation)
if letter is not None:
# New section boundary — save current section if non-empty
if current_bars and current_valid:
sections.append((current_function, current_bars))
current_bars = []
current_valid = True
current_function = func if func is not None else "unspecified"
if not current_valid:
continue
for bg in bar_groups:
positions = _bar_str_to_positions(bg, n_positions)
if positions is None:
current_valid = False
break
current_bars.append(positions)
# Save the final section
if current_bars and current_valid:
sections.append((current_function, current_bars))
output_dir.mkdir(parents=True, exist_ok=True)
n_saved = 0
skip_reasons: Counter[str] = Counter()
for idx, section in enumerate(sections):
bars = _section_to_bars(section, bar_duration, time_sig, subdivision)
if bars is None:
skip_reasons["unrecognized_chord"] += 1
continue
for idx, (func, bars) in enumerate(sections):
n = len(bars)
if n < 4:
log.debug(
"section %s in %s: %d bar(s) < 4, skipping",
section.letter, song_id, n,
"section %d in %s: %d bar(s) < 4, skipping", idx, song_id, n
)
skip_reasons["too_short"] += 1
continue
if n > 16:
log.debug(
"section %s in %s: %d bars > 16, skipping",
section.letter, song_id, n,
"section %d in %s: %d bars > 16, skipping", idx, song_id, n
)
skip_reasons["too_long"] += 1
continue
func = section.function
filename = f"mcgill_{song_id}_{idx:02d}_{func}.chord"
out_path = output_dir / filename
period_title = f"{artist} - {song_title} ({section.letter},{func})"
period_title = f"{artist} - {song_title} ({func})"
_write_chord_file(
out_path, period_title, key, time_sig, subdivision,
func if func != "unspecified" else None, bars,
out_path, period_title, key, time_sig, subdivision, func, bars
)
n_saved += 1
log.debug("wrote %s", out_path.name)
@@ -581,10 +606,6 @@ def convert_song(song_dir: Path, output_dir: Path) -> int:
def convert_dataset(dataset_dir: Path, output_dir: Path) -> tuple[int, int]:
"""Convert all song directories in a McGill Billboard dataset.
Args:
dataset_dir: Root directory containing per-song subdirectories.
output_dir: Destination directory for .chord files.
Returns:
(n_saved, n_empty) where n_empty counts songs that produced no output.
"""
@@ -606,7 +627,7 @@ def convert_dataset(dataset_dir: Path, output_dir: Path) -> tuple[int, int]:
# ---------------------------------------------------------------------------
# CLI entry point
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
@@ -615,7 +636,8 @@ if __name__ == "__main__":
epilog=(
"Example:\n"
" python -m src.external_converters.mcgill_to_chord "
"data/raw_external/mcgill/ --out data/raw_external/mcgill_converted/"
"data/raw_external/mcgill/billboard-2.0-salami_chords/ "
"--out data/raw_external/mcgill_chord/"
),
formatter_class=argparse.RawDescriptionHelpFormatter,
)
@@ -625,9 +647,9 @@ if __name__ == "__main__":
)
parser.add_argument(
"--out", type=Path,
default=Path("data/raw_external/mcgill_converted"),
default=Path("data/raw_external/mcgill_chord"),
metavar="output_dir",
help="destination for .chord files (default: data/raw_external/mcgill_converted/)",
help="destination for .chord files (default: data/raw_external/mcgill_chord/)",
)
parser.add_argument(
"--log-level", default="INFO",