feat: remove BAR token; bump spec to v2.3; fix max_seq_len
Bar boundaries are now implicit — the detokenizer counts positions per bar using TIME × SUB, and the generator gates EOS to bar boundaries only. Removing the deterministic BAR token reduces vocab size from 85 to 84 and lets the model focus on meaningful predictions.

- src/tokenizer.py: drop BAR from VOCAB (85→84); replace BAR-based detokenize_to_period with position-counting logic; add write_chord_file; fix _tokens_to_symbol for add9/m(add9) qualities
- tests/test_tokenizer.py: update vocab-size assertions to 84, structural token test, remove bar-count test, add test_no_bar_token_in_vocab
- docs/chord_format_spec.md: bump to v2.3; document BAR removal in §5.2, §5.3, §5.4, §5.5, §5.6, §6.2, and changelog
- CLAUDE.md: remove stale BAR reference, update vocab size to 84
- scripts/pretrain.py: raise max_seq_len 256→320 to cover regenerated McGill data (mean=83, max=283 tokens with BAR-free tokenizer)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+9
-11
@@ -39,17 +39,17 @@ VALID_FIXTURES = [
|
||||
|
||||
|
||||
class TestVocabulary:
|
||||
def test_vocab_has_85_tokens(self):
|
||||
assert len(VOCAB) == 85
|
||||
def test_vocab_has_84_tokens(self):
|
||||
assert len(VOCAB) == 84
|
||||
|
||||
def test_no_duplicate_tokens(self):
|
||||
assert len(set(VOCAB)) == 85
|
||||
assert len(set(VOCAB)) == 84
|
||||
|
||||
def test_token_to_id_covers_all_vocab(self):
|
||||
assert len(TOKEN_TO_ID) == 85
|
||||
assert len(TOKEN_TO_ID) == 84
|
||||
|
||||
def test_id_to_token_covers_all_vocab(self):
|
||||
assert len(ID_TO_TOKEN) == 85
|
||||
assert len(ID_TO_TOKEN) == 84
|
||||
|
||||
def test_ids_are_contiguous_from_zero(self):
|
||||
for i, tok in enumerate(VOCAB):
|
||||
@@ -63,7 +63,7 @@ class TestVocabulary:
|
||||
assert VOCAB[:4] == ["<BOS>", "<EOS>", "<PAD>", "<UNK>"]
|
||||
|
||||
def test_structural_tokens_at_end(self):
|
||||
assert VOCAB[-3:] == ["HOLD", "NC", "BAR"]
|
||||
assert VOCAB[-2:] == ["HOLD", "NC"]
|
||||
|
||||
def test_all_roots_present(self):
|
||||
for note in ("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"):
|
||||
@@ -112,10 +112,8 @@ class TestTokenizeStructure:
|
||||
assert toks[4] == "STYLE_other" # 'unspecified' is not in VOCAB → falls back to STYLE_other
|
||||
assert toks[5] == "FUNC_chorus"
|
||||
|
||||
def test_bar_token_count_matches_bar_count(self):
|
||||
p = parse_chord_file(FIXTURES / "valid_c_major.chord")
|
||||
ids = tokenize_period(p)
|
||||
assert sum(1 for i in ids if i == TOKEN_TO_ID["BAR"]) == len(p.bars)
|
||||
def test_no_bar_token_in_vocab(self):
|
||||
assert "BAR" not in TOKEN_TO_ID
|
||||
|
||||
def test_minor_period_emits_mode_minor(self):
|
||||
p = parse_chord_file(FIXTURES / "valid_b_minor.chord")
|
||||
@@ -130,7 +128,7 @@ class TestTokenizeStructure:
|
||||
def test_all_ids_in_vocab_range(self):
|
||||
for fixture_name in VALID_FIXTURES:
|
||||
p = parse_chord_file(FIXTURES / fixture_name)
|
||||
assert all(0 <= i < 85 for i in tokenize_period(p))
|
||||
assert all(0 <= i < 84 for i in tokenize_period(p))
|
||||
|
||||
def test_non_canonical_key_transposed_before_encoding(self):
|
||||
# F# major: first chord F#maj7 → Cmaj7 after shift=6; ROOT_C is at index 6.
|
||||
|
||||
Reference in New Issue
Block a user