data: update pretrained checkpoint results (BAR-free tokenizer)

Regenerate the pre-training results using the corrected 84-token vocabulary
and max_seq_len=320. The previous checkpoint was trained on stale data that
still contained BAR tokens, using a corrupted tokenizer.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-20 14:28:00 +03:00
parent 4aead2ea20
commit 8a73394df9
3 changed files with 107 additions and 107 deletions
+50 -50
View File
@@ -1,51 +1,51 @@
epoch,train_loss,val_loss,val_ppl,lr,elapsed_s epoch,train_loss,val_loss,val_ppl,lr,elapsed_s
1,2.031937,0.808181,2.24,2.205000e-04,11.4 1,2.043105,0.860380,2.36,2.205000e-04,13.1
2,0.641424,0.550909,1.73,2.998721e-04,9.6 2,0.682436,0.587271,1.80,2.998721e-04,11.9
3,0.523860,0.496441,1.64,2.991598e-04,9.6 3,0.567941,0.544875,1.72,2.991598e-04,12.0
4,0.485698,0.472027,1.60,2.978255e-04,9.8 4,0.529446,0.512912,1.67,2.978255e-04,12.5
5,0.464184,0.447461,1.56,2.958747e-04,9.4 5,0.505409,0.490817,1.63,2.958747e-04,12.4
6,0.445964,0.434830,1.54,2.933156e-04,9.7 6,0.484891,0.471718,1.60,2.933156e-04,12.5
7,0.431950,0.417041,1.52,2.901587e-04,9.8 7,0.467122,0.456903,1.58,2.901587e-04,12.7
8,0.417696,0.409715,1.51,2.864174e-04,9.8 8,0.450230,0.442813,1.56,2.864174e-04,12.9
9,0.405625,0.396861,1.49,2.821072e-04,10.0 9,0.435896,0.428490,1.53,2.821072e-04,13.1
10,0.394811,0.391014,1.48,2.772460e-04,9.9 10,0.425630,0.420062,1.52,2.772460e-04,13.1
11,0.384599,0.378818,1.46,2.718542e-04,10.1 11,0.414810,0.411151,1.51,2.718542e-04,12.9
12,0.376229,0.370746,1.45,2.659542e-04,10.2 12,0.405492,0.409687,1.51,2.659542e-04,12.9
13,0.366664,0.364249,1.44,2.595706e-04,10.3 13,0.396882,0.391923,1.48,2.595706e-04,12.9
14,0.358899,0.353221,1.42,2.527301e-04,10.3 14,0.387616,0.387274,1.47,2.527301e-04,12.8
15,0.351163,0.345543,1.41,2.454612e-04,10.2 15,0.379135,0.385116,1.47,2.454612e-04,12.9
16,0.344542,0.343143,1.41,2.377941e-04,10.0 16,0.371748,0.374518,1.45,2.377941e-04,13.0
17,0.337549,0.336707,1.40,2.297610e-04,10.1 17,0.364497,0.367260,1.44,2.297610e-04,12.9
18,0.331382,0.332268,1.39,2.213952e-04,10.1 18,0.357427,0.364524,1.44,2.213952e-04,12.9
19,0.325570,0.322937,1.38,2.127316e-04,10.1 19,0.350312,0.358540,1.43,2.127316e-04,12.9
20,0.318492,0.319304,1.38,2.038065e-04,10.1 20,0.342951,0.349801,1.42,2.038065e-04,12.9
21,0.313770,0.315012,1.37,1.946569e-04,10.0 21,0.337651,0.343782,1.41,1.946569e-04,12.8
22,0.307178,0.311228,1.37,1.853211e-04,10.2 22,0.330809,0.337008,1.40,1.853211e-04,12.8
23,0.302469,0.303362,1.35,1.758381e-04,10.2 23,0.324771,0.332336,1.39,1.758381e-04,12.8
24,0.297134,0.302971,1.35,1.662472e-04,10.2 24,0.319391,0.324907,1.38,1.662472e-04,12.8
25,0.292665,0.292786,1.34,1.565886e-04,10.2 25,0.314073,0.321501,1.38,1.565886e-04,12.9
26,0.287050,0.289937,1.34,1.469026e-04,10.0 26,0.309813,0.317718,1.37,1.469026e-04,12.8
27,0.282454,0.289310,1.34,1.372294e-04,10.1 27,0.304261,0.313438,1.37,1.372294e-04,12.9
28,0.278259,0.286254,1.33,1.276095e-04,10.2 28,0.299998,0.310763,1.36,1.276095e-04,12.9
29,0.274782,0.282411,1.33,1.180830e-04,10.1 29,0.295039,0.307241,1.36,1.180830e-04,12.9
30,0.270312,0.278289,1.32,1.086896e-04,10.1 30,0.290108,0.303446,1.35,1.086896e-04,12.8
31,0.267001,0.274995,1.32,9.946846e-05,10.1 31,0.288020,0.302041,1.35,9.946846e-05,12.8
32,0.263096,0.271817,1.31,9.045806e-05,10.0 32,0.283507,0.299317,1.35,9.045806e-05,12.8
33,0.260614,0.269074,1.31,8.169597e-05,10.1 33,0.280522,0.294816,1.34,8.169597e-05,12.8
34,0.257799,0.269102,1.31,7.321873e-05,10.1 34,0.275877,0.291919,1.34,7.321873e-05,12.9
35,0.253950,0.266719,1.31,6.506170e-05,10.2 35,0.273687,0.288819,1.33,6.506170e-05,12.8
36,0.251757,0.264989,1.30,5.725888e-05,10.1 36,0.270566,0.287831,1.33,5.725888e-05,13.0
37,0.249786,0.263033,1.30,4.984283e-05,10.0 37,0.267893,0.286515,1.33,4.984283e-05,13.0
38,0.247241,0.260050,1.30,4.284447e-05,10.1 38,0.265996,0.284756,1.33,4.284447e-05,13.0
39,0.245589,0.258710,1.30,3.629298e-05,10.2 39,0.264527,0.283663,1.33,3.629298e-05,13.0
40,0.243220,0.258440,1.29,3.021569e-05,10.1 40,0.262261,0.282717,1.33,3.021569e-05,12.9
41,0.242131,0.257187,1.29,2.463794e-05,10.1 41,0.260812,0.282175,1.33,2.463794e-05,12.8
42,0.240936,0.256695,1.29,1.958300e-05,10.1 42,0.258872,0.280704,1.32,1.958300e-05,12.8
43,0.239800,0.255997,1.29,1.507193e-05,10.0 43,0.257864,0.280204,1.32,1.507193e-05,12.8
44,0.238705,0.255310,1.29,1.112356e-05,10.1 44,0.256770,0.279358,1.32,1.112356e-05,12.8
45,0.238149,0.254971,1.29,7.754357e-06,10.1 45,0.254942,0.279263,1.32,7.754357e-06,13.0
46,0.237226,0.254995,1.29,4.978363e-06,10.1 46,0.255560,0.278873,1.32,4.978363e-06,12.8
47,0.236467,0.254608,1.29,2.807158e-06,10.2 47,0.255011,0.278650,1.32,2.807158e-06,12.9
48,0.236280,0.254222,1.29,1.249797e-06,9.9 48,0.254304,0.278583,1.32,1.249797e-06,12.8
49,0.235897,0.254291,1.29,3.127754e-07,10.2 49,0.252442,0.278481,1.32,3.127754e-07,12.8
50,0.237100,0.254293,1.29,0.000000e+00,10.1 50,0.253867,0.278494,1.32,0.000000e+00,12.8
1 epoch train_loss val_loss val_ppl lr elapsed_s
2 1 2.031937 2.043105 0.808181 0.860380 2.24 2.36 2.205000e-04 11.4 13.1
3 2 0.641424 0.682436 0.550909 0.587271 1.73 1.80 2.998721e-04 9.6 11.9
4 3 0.523860 0.567941 0.496441 0.544875 1.64 1.72 2.991598e-04 9.6 12.0
5 4 0.485698 0.529446 0.472027 0.512912 1.60 1.67 2.978255e-04 9.8 12.5
6 5 0.464184 0.505409 0.447461 0.490817 1.56 1.63 2.958747e-04 9.4 12.4
7 6 0.445964 0.484891 0.434830 0.471718 1.54 1.60 2.933156e-04 9.7 12.5
8 7 0.431950 0.467122 0.417041 0.456903 1.52 1.58 2.901587e-04 9.8 12.7
9 8 0.417696 0.450230 0.409715 0.442813 1.51 1.56 2.864174e-04 9.8 12.9
10 9 0.405625 0.435896 0.396861 0.428490 1.49 1.53 2.821072e-04 10.0 13.1
11 10 0.394811 0.425630 0.391014 0.420062 1.48 1.52 2.772460e-04 9.9 13.1
12 11 0.384599 0.414810 0.378818 0.411151 1.46 1.51 2.718542e-04 10.1 12.9
13 12 0.376229 0.405492 0.370746 0.409687 1.45 1.51 2.659542e-04 10.2 12.9
14 13 0.366664 0.396882 0.364249 0.391923 1.44 1.48 2.595706e-04 10.3 12.9
15 14 0.358899 0.387616 0.353221 0.387274 1.42 1.47 2.527301e-04 10.3 12.8
16 15 0.351163 0.379135 0.345543 0.385116 1.41 1.47 2.454612e-04 10.2 12.9
17 16 0.344542 0.371748 0.343143 0.374518 1.41 1.45 2.377941e-04 10.0 13.0
18 17 0.337549 0.364497 0.336707 0.367260 1.40 1.44 2.297610e-04 10.1 12.9
19 18 0.331382 0.357427 0.332268 0.364524 1.39 1.44 2.213952e-04 10.1 12.9
20 19 0.325570 0.350312 0.322937 0.358540 1.38 1.43 2.127316e-04 10.1 12.9
21 20 0.318492 0.342951 0.319304 0.349801 1.38 1.42 2.038065e-04 10.1 12.9
22 21 0.313770 0.337651 0.315012 0.343782 1.37 1.41 1.946569e-04 10.0 12.8
23 22 0.307178 0.330809 0.311228 0.337008 1.37 1.40 1.853211e-04 10.2 12.8
24 23 0.302469 0.324771 0.303362 0.332336 1.35 1.39 1.758381e-04 10.2 12.8
25 24 0.297134 0.319391 0.302971 0.324907 1.35 1.38 1.662472e-04 10.2 12.8
26 25 0.292665 0.314073 0.292786 0.321501 1.34 1.38 1.565886e-04 10.2 12.9
27 26 0.287050 0.309813 0.289937 0.317718 1.34 1.37 1.469026e-04 10.0 12.8
28 27 0.282454 0.304261 0.289310 0.313438 1.34 1.37 1.372294e-04 10.1 12.9
29 28 0.278259 0.299998 0.286254 0.310763 1.33 1.36 1.276095e-04 10.2 12.9
30 29 0.274782 0.295039 0.282411 0.307241 1.33 1.36 1.180830e-04 10.1 12.9
31 30 0.270312 0.290108 0.278289 0.303446 1.32 1.35 1.086896e-04 10.1 12.8
32 31 0.267001 0.288020 0.274995 0.302041 1.32 1.35 9.946846e-05 10.1 12.8
33 32 0.263096 0.283507 0.271817 0.299317 1.31 1.35 9.045806e-05 10.0 12.8
34 33 0.260614 0.280522 0.269074 0.294816 1.31 1.34 8.169597e-05 10.1 12.8
35 34 0.257799 0.275877 0.269102 0.291919 1.31 1.34 7.321873e-05 10.1 12.9
36 35 0.253950 0.273687 0.266719 0.288819 1.31 1.33 6.506170e-05 10.2 12.8
37 36 0.251757 0.270566 0.264989 0.287831 1.30 1.33 5.725888e-05 10.1 13.0
38 37 0.249786 0.267893 0.263033 0.286515 1.30 1.33 4.984283e-05 10.0 13.0
39 38 0.247241 0.265996 0.260050 0.284756 1.30 1.33 4.284447e-05 10.1 13.0
40 39 0.245589 0.264527 0.258710 0.283663 1.30 1.33 3.629298e-05 10.2 13.0
41 40 0.243220 0.262261 0.258440 0.282717 1.29 1.33 3.021569e-05 10.1 12.9
42 41 0.242131 0.260812 0.257187 0.282175 1.29 1.33 2.463794e-05 10.1 12.8
43 42 0.240936 0.258872 0.256695 0.280704 1.29 1.32 1.958300e-05 10.1 12.8
44 43 0.239800 0.257864 0.255997 0.280204 1.29 1.32 1.507193e-05 10.0 12.8
45 44 0.238705 0.256770 0.255310 0.279358 1.29 1.32 1.112356e-05 10.1 12.8
46 45 0.238149 0.254942 0.254971 0.279263 1.29 1.32 7.754357e-06 10.1 13.0
47 46 0.237226 0.255560 0.254995 0.278873 1.29 1.32 4.978363e-06 10.1 12.8
48 47 0.236467 0.255011 0.254608 0.278650 1.29 1.32 2.807158e-06 10.2 12.9
49 48 0.236280 0.254304 0.254222 0.278583 1.29 1.32 1.249797e-06 9.9 12.8
50 49 0.235897 0.252442 0.254291 0.278481 1.29 1.32 3.127754e-07 10.2 12.8
51 50 0.237100 0.253867 0.254293 0.278494 1.29 1.32 0.000000e+00 10.1 12.8
+55 -55
View File
@@ -3,65 +3,65 @@
PRE-TRAINING REPORT PRE-TRAINING REPORT
==================================================== ====================================================
Total epochs run : 50 Total epochs run : 50
Best epoch (val loss) : 48 Best epoch (val loss) : 49
Convergence epoch : 42 (val ≤ best+1 %) Convergence epoch : 42 (val ≤ best+1 %)
Best val loss : 0.2542 Best val loss : 0.2785
Best val perplexity : 1.29 Best val perplexity : 1.32
Final train loss : 0.2371 Final train loss : 0.2539
Unique parameters : 1,384,128 Unique parameters : 1,396,416
Checkpoint : checkpoints/pretrained.pt Checkpoint : checkpoints/pretrained.pt
Log CSV : checkpoints/pretrained.log.csv Log CSV : checkpoints/pretrained.log.csv
==================================================== ====================================================
epoch train val ppl lr epoch train val ppl lr
----- -------- -------- ------- ---------- ----- -------- -------- ------- ----------
1 2.0319 0.8082 2.24 2.20e-04 1 2.0431 0.8604 2.36 2.20e-04
2 0.6414 0.5509 1.73 3.00e-04 2 0.6824 0.5873 1.80 3.00e-04
3 0.5239 0.4964 1.64 2.99e-04 3 0.5679 0.5449 1.72 2.99e-04
4 0.4857 0.4720 1.60 2.98e-04 4 0.5294 0.5129 1.67 2.98e-04
5 0.4642 0.4475 1.56 2.96e-04 5 0.5054 0.4908 1.63 2.96e-04
6 0.4460 0.4348 1.54 2.93e-04 6 0.4849 0.4717 1.60 2.93e-04
7 0.4320 0.4170 1.52 2.90e-04 7 0.4671 0.4569 1.58 2.90e-04
8 0.4177 0.4097 1.51 2.86e-04 8 0.4502 0.4428 1.56 2.86e-04
9 0.4056 0.3969 1.49 2.82e-04 9 0.4359 0.4285 1.53 2.82e-04
10 0.3948 0.3910 1.48 2.77e-04 10 0.4256 0.4201 1.52 2.77e-04
11 0.3846 0.3788 1.46 2.72e-04 11 0.4148 0.4112 1.51 2.72e-04
12 0.3762 0.3707 1.45 2.66e-04 12 0.4055 0.4097 1.51 2.66e-04
13 0.3667 0.3642 1.44 2.60e-04 13 0.3969 0.3919 1.48 2.60e-04
14 0.3589 0.3532 1.42 2.53e-04 14 0.3876 0.3873 1.47 2.53e-04
15 0.3512 0.3455 1.41 2.45e-04 15 0.3791 0.3851 1.47 2.45e-04
16 0.3445 0.3431 1.41 2.38e-04 16 0.3717 0.3745 1.45 2.38e-04
17 0.3375 0.3367 1.40 2.30e-04 17 0.3645 0.3673 1.44 2.30e-04
18 0.3314 0.3323 1.39 2.21e-04 18 0.3574 0.3645 1.44 2.21e-04
19 0.3256 0.3229 1.38 2.13e-04 19 0.3503 0.3585 1.43 2.13e-04
20 0.3185 0.3193 1.38 2.04e-04 20 0.3430 0.3498 1.42 2.04e-04
21 0.3138 0.3150 1.37 1.95e-04 21 0.3377 0.3438 1.41 1.95e-04
22 0.3072 0.3112 1.37 1.85e-04 22 0.3308 0.3370 1.40 1.85e-04
23 0.3025 0.3034 1.35 1.76e-04 23 0.3248 0.3323 1.39 1.76e-04
24 0.2971 0.3030 1.35 1.66e-04 24 0.3194 0.3249 1.38 1.66e-04
25 0.2927 0.2928 1.34 1.57e-04 25 0.3141 0.3215 1.38 1.57e-04
26 0.2871 0.2899 1.34 1.47e-04 26 0.3098 0.3177 1.37 1.47e-04
27 0.2825 0.2893 1.34 1.37e-04 27 0.3043 0.3134 1.37 1.37e-04
28 0.2783 0.2863 1.33 1.28e-04 28 0.3000 0.3108 1.36 1.28e-04
29 0.2748 0.2824 1.33 1.18e-04 29 0.2950 0.3072 1.36 1.18e-04
30 0.2703 0.2783 1.32 1.09e-04 30 0.2901 0.3034 1.35 1.09e-04
31 0.2670 0.2750 1.32 9.95e-05 31 0.2880 0.3020 1.35 9.95e-05
32 0.2631 0.2718 1.31 9.05e-05 32 0.2835 0.2993 1.35 9.05e-05
33 0.2606 0.2691 1.31 8.17e-05 33 0.2805 0.2948 1.34 8.17e-05
34 0.2578 0.2691 1.31 7.32e-05 34 0.2759 0.2919 1.34 7.32e-05
35 0.2540 0.2667 1.31 6.51e-05 35 0.2737 0.2888 1.33 6.51e-05
36 0.2518 0.2650 1.30 5.73e-05 36 0.2706 0.2878 1.33 5.73e-05
37 0.2498 0.2630 1.30 4.98e-05 37 0.2679 0.2865 1.33 4.98e-05
38 0.2472 0.2601 1.30 4.28e-05 38 0.2660 0.2848 1.33 4.28e-05
39 0.2456 0.2587 1.30 3.63e-05 39 0.2645 0.2837 1.33 3.63e-05
40 0.2432 0.2584 1.29 3.02e-05 40 0.2623 0.2827 1.33 3.02e-05
41 0.2421 0.2572 1.29 2.46e-05 41 0.2608 0.2822 1.33 2.46e-05
42 0.2409 0.2567 1.29 1.96e-05 42 0.2589 0.2807 1.32 1.96e-05
43 0.2398 0.2560 1.29 1.51e-05 43 0.2579 0.2802 1.32 1.51e-05
44 0.2387 0.2553 1.29 1.11e-05 44 0.2568 0.2794 1.32 1.11e-05
45 0.2381 0.2550 1.29 7.75e-06 45 0.2549 0.2793 1.32 7.75e-06
46 0.2372 0.2550 1.29 4.98e-06 46 0.2556 0.2789 1.32 4.98e-06
47 0.2365 0.2546 1.29 2.81e-06 47 0.2550 0.2787 1.32 2.81e-06
48 0.2363 0.2542 1.29 1.25e-06 48 0.2543 0.2786 1.32 1.25e-06
49 0.2359 0.2543 1.29 3.13e-07 49 0.2524 0.2785 1.32 3.13e-07
50 0.2371 0.2543 1.29 0.00e+00 50 0.2539 0.2785 1.32 0.00e+00
Binary file not shown.