Model: fpadovani/ind-latn-100mb-after-ppt-shuff-dyck-10mb-ckpt500_seed3407 Source: Original Platform
14189 lines
387 KiB
JSON
14189 lines
387 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 5.485893416927899,
|
|
"eval_steps": 500,
|
|
"global_step": 7000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 4.789229106903076,
|
|
"epoch": 0.003918495297805642,
|
|
"grad_norm": 17.125,
|
|
"learning_rate": 2e-06,
|
|
"loss": 14.3537,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 9174.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 4.8115012645721436,
|
|
"epoch": 0.007836990595611285,
|
|
"grad_norm": 19.625,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 14.2452,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 17790.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 4.899150800704956,
|
|
"epoch": 0.011755485893416929,
|
|
"grad_norm": 24.25,
|
|
"learning_rate": 7e-06,
|
|
"loss": 13.9044,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 25850.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 5.367580604553223,
|
|
"epoch": 0.01567398119122257,
|
|
"grad_norm": 32.5,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 13.1444,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 35194.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 8.583788537979126,
|
|
"epoch": 0.019592476489028215,
|
|
"grad_norm": 7.5,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 11.3911,
|
|
"mean_token_accuracy": 0.00023256096756085753,
|
|
"num_tokens": 44218.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 10.630141735076904,
|
|
"epoch": 0.023510971786833857,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 10.7102,
|
|
"mean_token_accuracy": 0.014597209030762314,
|
|
"num_tokens": 53397.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.695956897735595,
|
|
"epoch": 0.0274294670846395,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 10.4664,
|
|
"mean_token_accuracy": 0.01781447734683752,
|
|
"num_tokens": 62749.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.673796558380127,
|
|
"epoch": 0.03134796238244514,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 10.1632,
|
|
"mean_token_accuracy": 0.0182854525744915,
|
|
"num_tokens": 71721.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.631663513183593,
|
|
"epoch": 0.03526645768025078,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 9.8792,
|
|
"mean_token_accuracy": 0.03653257880359888,
|
|
"num_tokens": 79844.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.549837970733643,
|
|
"epoch": 0.03918495297805643,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.7665,
|
|
"mean_token_accuracy": 0.04605128690600395,
|
|
"num_tokens": 88866.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.509830379486084,
|
|
"epoch": 0.04310344827586207,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.6605,
|
|
"mean_token_accuracy": 0.044031094387173654,
|
|
"num_tokens": 97918.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.524475193023681,
|
|
"epoch": 0.047021943573667714,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.5619,
|
|
"mean_token_accuracy": 0.04601282589137554,
|
|
"num_tokens": 107043.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.512676334381103,
|
|
"epoch": 0.050940438871473356,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.4987,
|
|
"mean_token_accuracy": 0.04643600396811962,
|
|
"num_tokens": 116000.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.477371215820312,
|
|
"epoch": 0.054858934169279,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.4179,
|
|
"mean_token_accuracy": 0.04148977212607861,
|
|
"num_tokens": 124559.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.470399188995362,
|
|
"epoch": 0.05877742946708464,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.2945,
|
|
"mean_token_accuracy": 0.04952896051108837,
|
|
"num_tokens": 132868.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.453096103668212,
|
|
"epoch": 0.06269592476489028,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.2848,
|
|
"mean_token_accuracy": 0.05274602882564068,
|
|
"num_tokens": 141286.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.431593227386475,
|
|
"epoch": 0.06661442006269593,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 9.1405,
|
|
"mean_token_accuracy": 0.05872356928884983,
|
|
"num_tokens": 150406.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.36865291595459,
|
|
"epoch": 0.07053291536050156,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 9.0678,
|
|
"mean_token_accuracy": 0.059385529905557635,
|
|
"num_tokens": 158770.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.264866065979003,
|
|
"epoch": 0.07445141065830721,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 8.9633,
|
|
"mean_token_accuracy": 0.06288341507315635,
|
|
"num_tokens": 167763.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.183263969421386,
|
|
"epoch": 0.07836990595611286,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.819,
|
|
"mean_token_accuracy": 0.0607046652585268,
|
|
"num_tokens": 177306.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 10.144334697723389,
|
|
"epoch": 0.0822884012539185,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.7349,
|
|
"mean_token_accuracy": 0.06028640605509281,
|
|
"num_tokens": 186014.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 10.06214361190796,
|
|
"epoch": 0.08620689655172414,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.5758,
|
|
"mean_token_accuracy": 0.06410923898220063,
|
|
"num_tokens": 194122.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 9.952830028533935,
|
|
"epoch": 0.09012539184952978,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.4698,
|
|
"mean_token_accuracy": 0.05936008468270302,
|
|
"num_tokens": 203097.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 9.820581531524658,
|
|
"epoch": 0.09404388714733543,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.3405,
|
|
"mean_token_accuracy": 0.06221077479422092,
|
|
"num_tokens": 211413.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 9.732498264312744,
|
|
"epoch": 0.09796238244514106,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.2718,
|
|
"mean_token_accuracy": 0.061625415459275246,
|
|
"num_tokens": 220550.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 9.504752349853515,
|
|
"epoch": 0.10188087774294671,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 8.0995,
|
|
"mean_token_accuracy": 0.0649514563381672,
|
|
"num_tokens": 229197.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 9.307702922821045,
|
|
"epoch": 0.10579937304075235,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.0979,
|
|
"mean_token_accuracy": 0.05685936994850636,
|
|
"num_tokens": 238479.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 9.162922954559326,
|
|
"epoch": 0.109717868338558,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 7.9442,
|
|
"mean_token_accuracy": 0.059861503541469574,
|
|
"num_tokens": 246318.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 8.96123743057251,
|
|
"epoch": 0.11363636363636363,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 7.9513,
|
|
"mean_token_accuracy": 0.05959276556968689,
|
|
"num_tokens": 254783.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 8.760778617858886,
|
|
"epoch": 0.11755485893416928,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 7.7945,
|
|
"mean_token_accuracy": 0.06369670145213605,
|
|
"num_tokens": 263416.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 8.68117027282715,
|
|
"epoch": 0.12147335423197492,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 7.8147,
|
|
"mean_token_accuracy": 0.0631796333938837,
|
|
"num_tokens": 271930.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 8.476777839660645,
|
|
"epoch": 0.12539184952978055,
|
|
"grad_norm": 0.78515625,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.7159,
|
|
"mean_token_accuracy": 0.06549291461706161,
|
|
"num_tokens": 280546.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 8.308262157440186,
|
|
"epoch": 0.12931034482758622,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.7078,
|
|
"mean_token_accuracy": 0.06602046675980092,
|
|
"num_tokens": 288813.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.279962158203125,
|
|
"epoch": 0.13322884012539185,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.6847,
|
|
"mean_token_accuracy": 0.06443305909633637,
|
|
"num_tokens": 297966.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 8.152728843688966,
|
|
"epoch": 0.1371473354231975,
|
|
"grad_norm": 0.8671875,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.7467,
|
|
"mean_token_accuracy": 0.06189337000250816,
|
|
"num_tokens": 307135.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 8.145699501037598,
|
|
"epoch": 0.14106583072100312,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.6581,
|
|
"mean_token_accuracy": 0.06488074697554111,
|
|
"num_tokens": 315546.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 8.149786376953125,
|
|
"epoch": 0.14498432601880878,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.6538,
|
|
"mean_token_accuracy": 0.06405953019857406,
|
|
"num_tokens": 323930.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 7.983444690704346,
|
|
"epoch": 0.14890282131661442,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.5166,
|
|
"mean_token_accuracy": 0.07129846066236496,
|
|
"num_tokens": 332419.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 7.974339866638184,
|
|
"epoch": 0.15282131661442006,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.6157,
|
|
"mean_token_accuracy": 0.06940566822886467,
|
|
"num_tokens": 341362.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 7.973450374603272,
|
|
"epoch": 0.15673981191222572,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.5198,
|
|
"mean_token_accuracy": 0.07214542552828788,
|
|
"num_tokens": 349395.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 7.946638202667236,
|
|
"epoch": 0.16065830721003135,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.545,
|
|
"mean_token_accuracy": 0.06696730926632881,
|
|
"num_tokens": 358413.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 7.863241577148438,
|
|
"epoch": 0.164576802507837,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.6316,
|
|
"mean_token_accuracy": 0.06982938721776008,
|
|
"num_tokens": 366489.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 7.940403842926026,
|
|
"epoch": 0.16849529780564262,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.5023,
|
|
"mean_token_accuracy": 0.06740010716021061,
|
|
"num_tokens": 375335.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 7.874079847335816,
|
|
"epoch": 0.1724137931034483,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.5555,
|
|
"mean_token_accuracy": 0.07188675999641418,
|
|
"num_tokens": 384276.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 7.911562585830689,
|
|
"epoch": 0.17633228840125392,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.5929,
|
|
"mean_token_accuracy": 0.06714313849806786,
|
|
"num_tokens": 393571.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 7.920306205749512,
|
|
"epoch": 0.18025078369905956,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.5193,
|
|
"mean_token_accuracy": 0.07089398205280303,
|
|
"num_tokens": 401865.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 7.848536252975464,
|
|
"epoch": 0.1841692789968652,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.4905,
|
|
"mean_token_accuracy": 0.07226377129554748,
|
|
"num_tokens": 410518.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.869985485076905,
|
|
"epoch": 0.18808777429467086,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.4997,
|
|
"mean_token_accuracy": 0.07303371652960777,
|
|
"num_tokens": 419769.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.837644481658936,
|
|
"epoch": 0.1920062695924765,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.437,
|
|
"mean_token_accuracy": 0.0742616519331932,
|
|
"num_tokens": 428204.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.897941255569458,
|
|
"epoch": 0.19592476489028213,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.5267,
|
|
"mean_token_accuracy": 0.06978406608104706,
|
|
"num_tokens": 436594.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.80897855758667,
|
|
"epoch": 0.19984326018808776,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.3256,
|
|
"mean_token_accuracy": 0.07486266531050205,
|
|
"num_tokens": 444645.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.8133704662323,
|
|
"epoch": 0.20376175548589343,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.3726,
|
|
"mean_token_accuracy": 0.07738698273897171,
|
|
"num_tokens": 453016.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.736161422729492,
|
|
"epoch": 0.20768025078369906,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.4507,
|
|
"mean_token_accuracy": 0.06978621035814285,
|
|
"num_tokens": 462116.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.664476203918457,
|
|
"epoch": 0.2115987460815047,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.3961,
|
|
"mean_token_accuracy": 0.07115238644182682,
|
|
"num_tokens": 470807.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.7677568912506105,
|
|
"epoch": 0.21551724137931033,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.4818,
|
|
"mean_token_accuracy": 0.07023664973676205,
|
|
"num_tokens": 479592.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.912611389160157,
|
|
"epoch": 0.219435736677116,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.4068,
|
|
"mean_token_accuracy": 0.07160313390195369,
|
|
"num_tokens": 488107.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.755217599868774,
|
|
"epoch": 0.22335423197492163,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.4212,
|
|
"mean_token_accuracy": 0.07538670524954796,
|
|
"num_tokens": 496775.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.762103033065796,
|
|
"epoch": 0.22727272727272727,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.4447,
|
|
"mean_token_accuracy": 0.07036296911537647,
|
|
"num_tokens": 505415.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.757038116455078,
|
|
"epoch": 0.23119122257053293,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.4344,
|
|
"mean_token_accuracy": 0.074312524497509,
|
|
"num_tokens": 514447.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.7855620861053465,
|
|
"epoch": 0.23510971786833856,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.4189,
|
|
"mean_token_accuracy": 0.07684484757483005,
|
|
"num_tokens": 522998.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.774819326400757,
|
|
"epoch": 0.2390282131661442,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.3794,
|
|
"mean_token_accuracy": 0.07512850686907768,
|
|
"num_tokens": 531542.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.7300177097320555,
|
|
"epoch": 0.24294670846394983,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.3247,
|
|
"mean_token_accuracy": 0.07261879369616508,
|
|
"num_tokens": 540143.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.773348236083985,
|
|
"epoch": 0.2468652037617555,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.4017,
|
|
"mean_token_accuracy": 0.07950926274061203,
|
|
"num_tokens": 549156.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.7285737037658695,
|
|
"epoch": 0.2507836990595611,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.2557,
|
|
"mean_token_accuracy": 0.07481630519032478,
|
|
"num_tokens": 557522.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.654256391525268,
|
|
"epoch": 0.2547021943573668,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.331,
|
|
"mean_token_accuracy": 0.07940150126814842,
|
|
"num_tokens": 566650.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.672131299972534,
|
|
"epoch": 0.25862068965517243,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.2864,
|
|
"mean_token_accuracy": 0.07869702018797398,
|
|
"num_tokens": 576219.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.713160848617553,
|
|
"epoch": 0.26253918495297807,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.374,
|
|
"mean_token_accuracy": 0.07567794360220433,
|
|
"num_tokens": 584304.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.634349060058594,
|
|
"epoch": 0.2664576802507837,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.3003,
|
|
"mean_token_accuracy": 0.07832697704434395,
|
|
"num_tokens": 593163.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.6303457736969,
|
|
"epoch": 0.27037617554858934,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 7.2164,
|
|
"mean_token_accuracy": 0.07754571028053761,
|
|
"num_tokens": 602077.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.6355628490448,
|
|
"epoch": 0.274294670846395,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 7.3284,
|
|
"mean_token_accuracy": 0.08122679404914379,
|
|
"num_tokens": 610009.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.685596513748169,
|
|
"epoch": 0.2782131661442006,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000177,
|
|
"loss": 7.362,
|
|
"mean_token_accuracy": 0.07597106769680977,
|
|
"num_tokens": 619282.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.619720935821533,
|
|
"epoch": 0.28213166144200624,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 7.2893,
|
|
"mean_token_accuracy": 0.08018167689442635,
|
|
"num_tokens": 628138.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.6796112060546875,
|
|
"epoch": 0.28605015673981193,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000182,
|
|
"loss": 7.1745,
|
|
"mean_token_accuracy": 0.08669476807117463,
|
|
"num_tokens": 637021.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.619413709640503,
|
|
"epoch": 0.28996865203761757,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 7.3161,
|
|
"mean_token_accuracy": 0.07477690353989601,
|
|
"num_tokens": 646703.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.623689222335815,
|
|
"epoch": 0.2938871473354232,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.1981,
|
|
"mean_token_accuracy": 0.08060777708888053,
|
|
"num_tokens": 655616.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.556978368759156,
|
|
"epoch": 0.29780564263322884,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 7.1994,
|
|
"mean_token_accuracy": 0.08719867020845413,
|
|
"num_tokens": 663783.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.5845866203308105,
|
|
"epoch": 0.3017241379310345,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000192,
|
|
"loss": 7.2094,
|
|
"mean_token_accuracy": 0.08289245739579201,
|
|
"num_tokens": 671855.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.527840566635132,
|
|
"epoch": 0.3056426332288401,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 7.2219,
|
|
"mean_token_accuracy": 0.07747755497694016,
|
|
"num_tokens": 680981.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.6136561870574955,
|
|
"epoch": 0.30956112852664575,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 7.1491,
|
|
"mean_token_accuracy": 0.08336339518427849,
|
|
"num_tokens": 689294.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.469813442230224,
|
|
"epoch": 0.31347962382445144,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 7.1035,
|
|
"mean_token_accuracy": 0.08146922513842583,
|
|
"num_tokens": 697703.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.550826740264893,
|
|
"epoch": 0.31739811912225707,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000202,
|
|
"loss": 7.2372,
|
|
"mean_token_accuracy": 0.08058681413531303,
|
|
"num_tokens": 706792.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.606830406188965,
|
|
"epoch": 0.3213166144200627,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 7.1473,
|
|
"mean_token_accuracy": 0.08346155509352685,
|
|
"num_tokens": 715864.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.3859583854675295,
|
|
"epoch": 0.32523510971786834,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000207,
|
|
"loss": 7.1975,
|
|
"mean_token_accuracy": 0.0853593334555626,
|
|
"num_tokens": 723921.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.5406107902526855,
|
|
"epoch": 0.329153605015674,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 7.2071,
|
|
"mean_token_accuracy": 0.08046000376343727,
|
|
"num_tokens": 732797.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.510403490066528,
|
|
"epoch": 0.3330721003134796,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000212,
|
|
"loss": 7.0654,
|
|
"mean_token_accuracy": 0.0873202033340931,
|
|
"num_tokens": 741248.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.501159954071045,
|
|
"epoch": 0.33699059561128525,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 7.1615,
|
|
"mean_token_accuracy": 0.08190247714519501,
|
|
"num_tokens": 749766.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.408373832702637,
|
|
"epoch": 0.3409090909090909,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 7.1268,
|
|
"mean_token_accuracy": 0.08113668784499169,
|
|
"num_tokens": 758695.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.44956521987915,
|
|
"epoch": 0.3448275862068966,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 7.1248,
|
|
"mean_token_accuracy": 0.08192591443657875,
|
|
"num_tokens": 767624.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.422909212112427,
|
|
"epoch": 0.3487460815047022,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000222,
|
|
"loss": 7.117,
|
|
"mean_token_accuracy": 0.0853099413216114,
|
|
"num_tokens": 776616.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.365292644500732,
|
|
"epoch": 0.35266457680250785,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 7.1317,
|
|
"mean_token_accuracy": 0.08413158729672432,
|
|
"num_tokens": 786147.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.536469745635986,
|
|
"epoch": 0.3565830721003135,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 7.2317,
|
|
"mean_token_accuracy": 0.08228531405329705,
|
|
"num_tokens": 795213.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.461417722702026,
|
|
"epoch": 0.3605015673981191,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 7.0349,
|
|
"mean_token_accuracy": 0.09094136133790016,
|
|
"num_tokens": 803118.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.444038438796997,
|
|
"epoch": 0.36442006269592475,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 7.0219,
|
|
"mean_token_accuracy": 0.09442275986075402,
|
|
"num_tokens": 811358.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.324700260162354,
|
|
"epoch": 0.3683385579937304,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 7.0256,
|
|
"mean_token_accuracy": 0.08778790757060051,
|
|
"num_tokens": 819653.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.2960240840911865,
|
|
"epoch": 0.3722570532915361,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000237,
|
|
"loss": 7.0511,
|
|
"mean_token_accuracy": 0.08624262139201164,
|
|
"num_tokens": 828462.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.437795686721802,
|
|
"epoch": 0.3761755485893417,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 7.1429,
|
|
"mean_token_accuracy": 0.0912679947912693,
|
|
"num_tokens": 836204.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.2959794998168945,
|
|
"epoch": 0.38009404388714735,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000242,
|
|
"loss": 7.0169,
|
|
"mean_token_accuracy": 0.09246607050299645,
|
|
"num_tokens": 845032.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.4119359970092775,
|
|
"epoch": 0.384012539184953,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 7.0308,
|
|
"mean_token_accuracy": 0.08805579245090485,
|
|
"num_tokens": 853324.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.404975366592407,
|
|
"epoch": 0.3879310344827586,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.992,
|
|
"mean_token_accuracy": 0.1035026639699936,
|
|
"num_tokens": 861640.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.385119247436523,
|
|
"epoch": 0.39184952978056425,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 7.1744,
|
|
"mean_token_accuracy": 0.082430200278759,
|
|
"num_tokens": 870758.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.39184952978056425,
|
|
"eval_entropy": 7.167502074278603,
|
|
"eval_loss": 7.156619548797607,
|
|
"eval_mean_token_accuracy": 0.08891707594152684,
|
|
"eval_num_tokens": 870758.0,
|
|
"eval_runtime": 2.8546,
|
|
"eval_samples_per_second": 1444.004,
|
|
"eval_steps_per_second": 180.763,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 7.3540655136108395,
|
|
"epoch": 0.3957680250783699,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000252,
|
|
"loss": 7.0906,
|
|
"mean_token_accuracy": 0.0840302512049675,
|
|
"num_tokens": 879968.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 7.318256664276123,
|
|
"epoch": 0.3996865203761755,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 7.0471,
|
|
"mean_token_accuracy": 0.08631112575531005,
|
|
"num_tokens": 888804.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.259663057327271,
|
|
"epoch": 0.4036050156739812,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.000257,
|
|
"loss": 7.0624,
|
|
"mean_token_accuracy": 0.08012468516826629,
|
|
"num_tokens": 898593.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 7.2833295345306395,
|
|
"epoch": 0.40752351097178685,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 7.0799,
|
|
"mean_token_accuracy": 0.08342506065964699,
|
|
"num_tokens": 906873.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.3285657405853275,
|
|
"epoch": 0.4114420062695925,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000262,
|
|
"loss": 7.0359,
|
|
"mean_token_accuracy": 0.0934828281402588,
|
|
"num_tokens": 915080.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.422465038299561,
|
|
"epoch": 0.4153605015673981,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 7.0396,
|
|
"mean_token_accuracy": 0.08701496720314025,
|
|
"num_tokens": 923741.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 7.216967153549194,
|
|
"epoch": 0.41927899686520376,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 6.9459,
|
|
"mean_token_accuracy": 0.0904500350356102,
|
|
"num_tokens": 932455.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.235203742980957,
|
|
"epoch": 0.4231974921630094,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 6.9492,
|
|
"mean_token_accuracy": 0.09165697544813156,
|
|
"num_tokens": 941064.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 7.23210301399231,
|
|
"epoch": 0.427115987460815,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 6.9795,
|
|
"mean_token_accuracy": 0.0916426420211792,
|
|
"num_tokens": 950261.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 7.305574369430542,
|
|
"epoch": 0.43103448275862066,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 7.0461,
|
|
"mean_token_accuracy": 0.0905070275068283,
|
|
"num_tokens": 959151.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 7.299721527099609,
|
|
"epoch": 0.43495297805642635,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000277,
|
|
"loss": 7.0677,
|
|
"mean_token_accuracy": 0.09062978066504002,
|
|
"num_tokens": 968441.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 7.134230327606201,
|
|
"epoch": 0.438871473354232,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 6.9371,
|
|
"mean_token_accuracy": 0.09018276557326317,
|
|
"num_tokens": 977058.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 7.334470558166504,
|
|
"epoch": 0.4427899686520376,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 6.9519,
|
|
"mean_token_accuracy": 0.08950636759400368,
|
|
"num_tokens": 986531.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 7.123916816711426,
|
|
"epoch": 0.44670846394984326,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 6.8108,
|
|
"mean_token_accuracy": 0.09824811816215515,
|
|
"num_tokens": 995081.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 7.103513240814209,
|
|
"epoch": 0.4506269592476489,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000287,
|
|
"loss": 6.8802,
|
|
"mean_token_accuracy": 0.09345417022705078,
|
|
"num_tokens": 1003459.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 7.101778936386109,
|
|
"epoch": 0.45454545454545453,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 6.9239,
|
|
"mean_token_accuracy": 0.09018066227436065,
|
|
"num_tokens": 1012420.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 7.261321687698365,
|
|
"epoch": 0.45846394984326017,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000292,
|
|
"loss": 6.9954,
|
|
"mean_token_accuracy": 0.09160361662507058,
|
|
"num_tokens": 1021198.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 7.202180290222168,
|
|
"epoch": 0.46238244514106586,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 6.9694,
|
|
"mean_token_accuracy": 0.09484207406640052,
|
|
"num_tokens": 1030023.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 7.164184045791626,
|
|
"epoch": 0.4663009404388715,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000297,
|
|
"loss": 6.8295,
|
|
"mean_token_accuracy": 0.0953464850783348,
|
|
"num_tokens": 1038824.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 7.216883039474487,
|
|
"epoch": 0.4702194357366771,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 7.0882,
|
|
"mean_token_accuracy": 0.09343515783548355,
|
|
"num_tokens": 1048207.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 7.304975414276123,
|
|
"epoch": 0.47413793103448276,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000302,
|
|
"loss": 7.0559,
|
|
"mean_token_accuracy": 0.08962106555700303,
|
|
"num_tokens": 1057354.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 7.198044776916504,
|
|
"epoch": 0.4780564263322884,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 6.9893,
|
|
"mean_token_accuracy": 0.09330410435795784,
|
|
"num_tokens": 1066825.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 7.137618494033814,
|
|
"epoch": 0.48197492163009403,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000307,
|
|
"loss": 6.9155,
|
|
"mean_token_accuracy": 0.08996602892875671,
|
|
"num_tokens": 1075752.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 7.007559776306152,
|
|
"epoch": 0.48589341692789967,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.8323,
|
|
"mean_token_accuracy": 0.09399376884102821,
|
|
"num_tokens": 1084826.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 7.223559427261352,
|
|
"epoch": 0.4898119122257053,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.9666,
|
|
"mean_token_accuracy": 0.09359999522566795,
|
|
"num_tokens": 1093425.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 7.108446407318115,
|
|
"epoch": 0.493730407523511,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 6.9828,
|
|
"mean_token_accuracy": 0.089838757365942,
|
|
"num_tokens": 1102619.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 7.145089435577392,
|
|
"epoch": 0.49764890282131663,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.8601,
|
|
"mean_token_accuracy": 0.09595804288983345,
|
|
"num_tokens": 1111337.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 7.005008172988892,
|
|
"epoch": 0.5015673981191222,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.7035,
|
|
"mean_token_accuracy": 0.0990886114537716,
|
|
"num_tokens": 1119381.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 6.96934700012207,
|
|
"epoch": 0.5054858934169278,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000322,
|
|
"loss": 6.7597,
|
|
"mean_token_accuracy": 0.09665322229266167,
|
|
"num_tokens": 1127825.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 7.11965799331665,
|
|
"epoch": 0.5094043887147336,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 6.8203,
|
|
"mean_token_accuracy": 0.09057366773486138,
|
|
"num_tokens": 1136750.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 7.017684459686279,
|
|
"epoch": 0.5133228840125392,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 6.8499,
|
|
"mean_token_accuracy": 0.09456580057740212,
|
|
"num_tokens": 1145739.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 7.034306764602661,
|
|
"epoch": 0.5172413793103449,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.8973,
|
|
"mean_token_accuracy": 0.09757498279213905,
|
|
"num_tokens": 1154415.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 7.052440977096557,
|
|
"epoch": 0.5211598746081505,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.7751,
|
|
"mean_token_accuracy": 0.09611302688717842,
|
|
"num_tokens": 1162480.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 6.977913856506348,
|
|
"epoch": 0.5250783699059561,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.8559,
|
|
"mean_token_accuracy": 0.09223495721817017,
|
|
"num_tokens": 1170774.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 7.030881881713867,
|
|
"epoch": 0.5289968652037618,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.7924,
|
|
"mean_token_accuracy": 0.09683787003159523,
|
|
"num_tokens": 1179629.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 7.053485012054443,
|
|
"epoch": 0.5329153605015674,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.938,
|
|
"mean_token_accuracy": 0.09229604452848435,
|
|
"num_tokens": 1189111.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 7.108199834823608,
|
|
"epoch": 0.536833855799373,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.9495,
|
|
"mean_token_accuracy": 0.09128881692886352,
|
|
"num_tokens": 1198827.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 7.092297840118408,
|
|
"epoch": 0.5407523510971787,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 6.8482,
|
|
"mean_token_accuracy": 0.09970205947756768,
|
|
"num_tokens": 1207089.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 7.057426071166992,
|
|
"epoch": 0.5446708463949843,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.8435,
|
|
"mean_token_accuracy": 0.08852889537811279,
|
|
"num_tokens": 1216509.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 6.900876426696778,
|
|
"epoch": 0.54858934169279,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.6756,
|
|
"mean_token_accuracy": 0.10320580378174782,
|
|
"num_tokens": 1225684.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 7.005167055130005,
|
|
"epoch": 0.5525078369905956,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.7057,
|
|
"mean_token_accuracy": 0.10480915755033493,
|
|
"num_tokens": 1233675.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 6.892497873306274,
|
|
"epoch": 0.5564263322884012,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.7091,
|
|
"mean_token_accuracy": 0.10067695155739784,
|
|
"num_tokens": 1242147.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 6.934285736083984,
|
|
"epoch": 0.5603448275862069,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.8249,
|
|
"mean_token_accuracy": 0.09867035746574401,
|
|
"num_tokens": 1251127.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 6.978139781951905,
|
|
"epoch": 0.5642633228840125,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.7775,
|
|
"mean_token_accuracy": 0.09740801975131035,
|
|
"num_tokens": 1260589.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 7.05702314376831,
|
|
"epoch": 0.5681818181818182,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.8522,
|
|
"mean_token_accuracy": 0.09754758477210998,
|
|
"num_tokens": 1269281.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 6.951777076721191,
|
|
"epoch": 0.5721003134796239,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.7117,
|
|
"mean_token_accuracy": 0.09818840324878693,
|
|
"num_tokens": 1278033.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 7.001934242248535,
|
|
"epoch": 0.5760188087774295,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.7808,
|
|
"mean_token_accuracy": 0.10354246944189072,
|
|
"num_tokens": 1285847.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 6.932463216781616,
|
|
"epoch": 0.5799373040752351,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.7726,
|
|
"mean_token_accuracy": 0.09513568431138993,
|
|
"num_tokens": 1294770.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 7.036710739135742,
|
|
"epoch": 0.5838557993730408,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.9512,
|
|
"mean_token_accuracy": 0.0940048098564148,
|
|
"num_tokens": 1304282.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 6.997774505615235,
|
|
"epoch": 0.5877742946708464,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.7923,
|
|
"mean_token_accuracy": 0.09595935121178627,
|
|
"num_tokens": 1313218.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 6.966546869277954,
|
|
"epoch": 0.591692789968652,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.754,
|
|
"mean_token_accuracy": 0.0985157236456871,
|
|
"num_tokens": 1322093.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 6.83478045463562,
|
|
"epoch": 0.5956112852664577,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 6.7857,
|
|
"mean_token_accuracy": 0.09971616193652152,
|
|
"num_tokens": 1330718.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 6.954968690872192,
|
|
"epoch": 0.5995297805642633,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.6457,
|
|
"mean_token_accuracy": 0.10431547313928605,
|
|
"num_tokens": 1339672.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 6.861670970916748,
|
|
"epoch": 0.603448275862069,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.7001,
|
|
"mean_token_accuracy": 0.10293109342455864,
|
|
"num_tokens": 1348587.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 6.8885609149932865,
|
|
"epoch": 0.6073667711598746,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.6826,
|
|
"mean_token_accuracy": 0.10689334198832512,
|
|
"num_tokens": 1357597.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 6.883357572555542,
|
|
"epoch": 0.6112852664576802,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.7744,
|
|
"mean_token_accuracy": 0.101459039747715,
|
|
"num_tokens": 1366246.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 6.89829683303833,
|
|
"epoch": 0.6152037617554859,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.6695,
|
|
"mean_token_accuracy": 0.10532263070344924,
|
|
"num_tokens": 1374664.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 6.8325498580932615,
|
|
"epoch": 0.6191222570532915,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.6889,
|
|
"mean_token_accuracy": 0.10108358785510063,
|
|
"num_tokens": 1382765.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 6.7815876483917235,
|
|
"epoch": 0.6230407523510971,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.6442,
|
|
"mean_token_accuracy": 0.10871021300554276,
|
|
"num_tokens": 1391708.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 6.786308908462525,
|
|
"epoch": 0.6269592476489029,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.814,
|
|
"mean_token_accuracy": 0.09444232732057571,
|
|
"num_tokens": 1401373.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 6.926651859283448,
|
|
"epoch": 0.6308777429467085,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.6124,
|
|
"mean_token_accuracy": 0.10876154825091362,
|
|
"num_tokens": 1409702.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 6.87204418182373,
|
|
"epoch": 0.6347962382445141,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.8084,
|
|
"mean_token_accuracy": 0.10008606985211373,
|
|
"num_tokens": 1419596.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 6.807423734664917,
|
|
"epoch": 0.6387147335423198,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.7461,
|
|
"mean_token_accuracy": 0.10076582729816437,
|
|
"num_tokens": 1428437.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 6.9204614639282225,
|
|
"epoch": 0.6426332288401254,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.6547,
|
|
"mean_token_accuracy": 0.10060450211167335,
|
|
"num_tokens": 1436764.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 6.812792015075684,
|
|
"epoch": 0.646551724137931,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.7007,
|
|
"mean_token_accuracy": 0.10875345095992088,
|
|
"num_tokens": 1445755.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 6.902109909057617,
|
|
"epoch": 0.6504702194357367,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.7627,
|
|
"mean_token_accuracy": 0.10212339907884598,
|
|
"num_tokens": 1454550.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 6.757165718078613,
|
|
"epoch": 0.6543887147335423,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.6529,
|
|
"mean_token_accuracy": 0.1076908752322197,
|
|
"num_tokens": 1463039.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 6.920673799514771,
|
|
"epoch": 0.658307210031348,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.7125,
|
|
"mean_token_accuracy": 0.10059169679880142,
|
|
"num_tokens": 1472029.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 6.742834091186523,
|
|
"epoch": 0.6622257053291536,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.723,
|
|
"mean_token_accuracy": 0.10695556625723839,
|
|
"num_tokens": 1481588.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 6.7755883693695065,
|
|
"epoch": 0.6661442006269592,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.6131,
|
|
"mean_token_accuracy": 0.1035246841609478,
|
|
"num_tokens": 1489915.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 6.87349271774292,
|
|
"epoch": 0.6700626959247649,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.7211,
|
|
"mean_token_accuracy": 0.10070185288786888,
|
|
"num_tokens": 1497920.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.742019605636597,
|
|
"epoch": 0.6739811912225705,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.5571,
|
|
"mean_token_accuracy": 0.10514650270342826,
|
|
"num_tokens": 1506965.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 6.755401372909546,
|
|
"epoch": 0.6778996865203761,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.6626,
|
|
"mean_token_accuracy": 0.10886923670768738,
|
|
"num_tokens": 1516028.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 6.824488735198974,
|
|
"epoch": 0.6818181818181818,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.6802,
|
|
"mean_token_accuracy": 0.10832962691783905,
|
|
"num_tokens": 1524579.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 6.729337167739868,
|
|
"epoch": 0.6857366771159875,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.605,
|
|
"mean_token_accuracy": 0.1062053769826889,
|
|
"num_tokens": 1534285.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 6.7189655780792235,
|
|
"epoch": 0.6896551724137931,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.5898,
|
|
"mean_token_accuracy": 0.10706395953893662,
|
|
"num_tokens": 1543701.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 6.891758966445923,
|
|
"epoch": 0.6935736677115988,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.8576,
|
|
"mean_token_accuracy": 0.09462928622961045,
|
|
"num_tokens": 1552715.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 6.6106942653656,
|
|
"epoch": 0.6974921630094044,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.4728,
|
|
"mean_token_accuracy": 0.1057778999209404,
|
|
"num_tokens": 1561523.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 6.685323572158813,
|
|
"epoch": 0.70141065830721,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.6995,
|
|
"mean_token_accuracy": 0.10680059865117073,
|
|
"num_tokens": 1570057.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 6.810699081420898,
|
|
"epoch": 0.7053291536050157,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.5487,
|
|
"mean_token_accuracy": 0.10731169655919075,
|
|
"num_tokens": 1578795.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 6.618034505844117,
|
|
"epoch": 0.7092476489028213,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.627,
|
|
"mean_token_accuracy": 0.10445040464401245,
|
|
"num_tokens": 1588108.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 6.834760522842407,
|
|
"epoch": 0.713166144200627,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.5897,
|
|
"mean_token_accuracy": 0.10842868015170097,
|
|
"num_tokens": 1596545.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 6.617375135421753,
|
|
"epoch": 0.7170846394984326,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.5774,
|
|
"mean_token_accuracy": 0.11101481318473816,
|
|
"num_tokens": 1605098.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 6.7286604881286625,
|
|
"epoch": 0.7210031347962382,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.5928,
|
|
"mean_token_accuracy": 0.11244359910488129,
|
|
"num_tokens": 1613760.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 6.685993957519531,
|
|
"epoch": 0.7249216300940439,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.6311,
|
|
"mean_token_accuracy": 0.10528192594647408,
|
|
"num_tokens": 1622345.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 6.825484371185302,
|
|
"epoch": 0.7288401253918495,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.6198,
|
|
"mean_token_accuracy": 0.10753775164484977,
|
|
"num_tokens": 1630706.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 6.736838388442993,
|
|
"epoch": 0.7327586206896551,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.6851,
|
|
"mean_token_accuracy": 0.10634701699018478,
|
|
"num_tokens": 1639880.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 6.709425592422486,
|
|
"epoch": 0.7366771159874608,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.5189,
|
|
"mean_token_accuracy": 0.10119672417640686,
|
|
"num_tokens": 1648325.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 6.547716999053955,
|
|
"epoch": 0.7405956112852664,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.4928,
|
|
"mean_token_accuracy": 0.11216101795434952,
|
|
"num_tokens": 1656951.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 6.6757955074310305,
|
|
"epoch": 0.7445141065830722,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.6264,
|
|
"mean_token_accuracy": 0.1065959431231022,
|
|
"num_tokens": 1665832.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.653597450256347,
|
|
"epoch": 0.7484326018808778,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.5458,
|
|
"mean_token_accuracy": 0.10681739151477813,
|
|
"num_tokens": 1675116.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 6.55257477760315,
|
|
"epoch": 0.7523510971786834,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.5317,
|
|
"mean_token_accuracy": 0.11514699757099152,
|
|
"num_tokens": 1684195.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 6.689774370193481,
|
|
"epoch": 0.7562695924764891,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.4895,
|
|
"mean_token_accuracy": 0.11039396822452545,
|
|
"num_tokens": 1693025.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.481554937362671,
|
|
"epoch": 0.7601880877742947,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.4836,
|
|
"mean_token_accuracy": 0.10764000788331032,
|
|
"num_tokens": 1701210.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 6.6965916633605955,
|
|
"epoch": 0.7641065830721003,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.5847,
|
|
"mean_token_accuracy": 0.10125251486897469,
|
|
"num_tokens": 1711026.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 6.65680046081543,
|
|
"epoch": 0.768025078369906,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.5874,
|
|
"mean_token_accuracy": 0.10824618190526962,
|
|
"num_tokens": 1719441.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 6.599896430969238,
|
|
"epoch": 0.7719435736677116,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.5849,
|
|
"mean_token_accuracy": 0.10633337944746017,
|
|
"num_tokens": 1728300.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 6.620725393295288,
|
|
"epoch": 0.7758620689655172,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.4769,
|
|
"mean_token_accuracy": 0.11418513432145119,
|
|
"num_tokens": 1736364.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.466502714157104,
|
|
"epoch": 0.7797805642633229,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.3224,
|
|
"mean_token_accuracy": 0.1263233445584774,
|
|
"num_tokens": 1745043.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.654029464721679,
|
|
"epoch": 0.7836990595611285,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.4667,
|
|
"mean_token_accuracy": 0.11487501338124276,
|
|
"num_tokens": 1754008.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.7836990595611285,
|
|
"eval_entropy": 6.3669652088668,
|
|
"eval_loss": 6.630230903625488,
|
|
"eval_mean_token_accuracy": 0.11402813254227472,
|
|
"eval_num_tokens": 1754008.0,
|
|
"eval_runtime": 2.8378,
|
|
"eval_samples_per_second": 1452.509,
|
|
"eval_steps_per_second": 181.828,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 6.444351482391357,
|
|
"epoch": 0.7876175548589341,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999998713243189,
|
|
"loss": 6.5346,
|
|
"mean_token_accuracy": 0.11141253411769866,
|
|
"num_tokens": 1763592.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"entropy": 6.584927940368653,
|
|
"epoch": 0.7915360501567398,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004999993485796164,
|
|
"loss": 6.403,
|
|
"mean_token_accuracy": 0.1155676744878292,
|
|
"num_tokens": 1773111.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 6.580813217163086,
|
|
"epoch": 0.7954545454545454,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004999984237245962,
|
|
"loss": 6.4624,
|
|
"mean_token_accuracy": 0.11948382258415222,
|
|
"num_tokens": 1781230.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"entropy": 6.740645933151245,
|
|
"epoch": 0.799373040752351,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004999970967609109,
|
|
"loss": 6.6906,
|
|
"mean_token_accuracy": 0.1132524773478508,
|
|
"num_tokens": 1790324.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 6.513845157623291,
|
|
"epoch": 0.8032915360501567,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004999953676909322,
|
|
"loss": 6.5348,
|
|
"mean_token_accuracy": 0.1044821061193943,
|
|
"num_tokens": 1798849.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 6.588596630096435,
|
|
"epoch": 0.8072100313479624,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004999932365177502,
|
|
"loss": 6.4097,
|
|
"mean_token_accuracy": 0.11618589833378792,
|
|
"num_tokens": 1808019.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 6.624749708175659,
|
|
"epoch": 0.8111285266457681,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999907032451735,
|
|
"loss": 6.5302,
|
|
"mean_token_accuracy": 0.11330178454518318,
|
|
"num_tokens": 1816550.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"entropy": 6.507697629928589,
|
|
"epoch": 0.8150470219435737,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999877678777296,
|
|
"loss": 6.4044,
|
|
"mean_token_accuracy": 0.12338297590613365,
|
|
"num_tokens": 1825238.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 6.4800478458404545,
|
|
"epoch": 0.8189655172413793,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999844304206645,
|
|
"loss": 6.3892,
|
|
"mean_token_accuracy": 0.11694491282105446,
|
|
"num_tokens": 1833645.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"entropy": 6.564522838592529,
|
|
"epoch": 0.822884012539185,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999806908799428,
|
|
"loss": 6.4637,
|
|
"mean_token_accuracy": 0.11064697802066803,
|
|
"num_tokens": 1842892.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 6.528485631942749,
|
|
"epoch": 0.8268025078369906,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999765492622474,
|
|
"loss": 6.5046,
|
|
"mean_token_accuracy": 0.11715267226099968,
|
|
"num_tokens": 1851400.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"entropy": 6.596149349212647,
|
|
"epoch": 0.8307210031347962,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999720055749804,
|
|
"loss": 6.4464,
|
|
"mean_token_accuracy": 0.11808413341641426,
|
|
"num_tokens": 1860088.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 6.574128770828247,
|
|
"epoch": 0.8346394984326019,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999670598262619,
|
|
"loss": 6.4971,
|
|
"mean_token_accuracy": 0.11734354719519616,
|
|
"num_tokens": 1868558.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"entropy": 6.427276229858398,
|
|
"epoch": 0.8385579937304075,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004999617120249308,
|
|
"loss": 6.4897,
|
|
"mean_token_accuracy": 0.11183837950229644,
|
|
"num_tokens": 1877624.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 6.671892261505127,
|
|
"epoch": 0.8424764890282131,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999559621805445,
|
|
"loss": 6.5975,
|
|
"mean_token_accuracy": 0.11244333311915397,
|
|
"num_tokens": 1886505.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 6.452223205566407,
|
|
"epoch": 0.8463949843260188,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999498103033788,
|
|
"loss": 6.3456,
|
|
"mean_token_accuracy": 0.12142532989382744,
|
|
"num_tokens": 1895701.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 6.390986537933349,
|
|
"epoch": 0.8503134796238244,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999432564044284,
|
|
"loss": 6.3008,
|
|
"mean_token_accuracy": 0.12294157966971397,
|
|
"num_tokens": 1904640.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"entropy": 6.614175415039062,
|
|
"epoch": 0.85423197492163,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999363004954058,
|
|
"loss": 6.5357,
|
|
"mean_token_accuracy": 0.11326182112097741,
|
|
"num_tokens": 1914216.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 6.48394021987915,
|
|
"epoch": 0.8581504702194357,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999289425887425,
|
|
"loss": 6.4573,
|
|
"mean_token_accuracy": 0.11912907212972641,
|
|
"num_tokens": 1923280.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"entropy": 6.611083984375,
|
|
"epoch": 0.8620689655172413,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999211826975884,
|
|
"loss": 6.4764,
|
|
"mean_token_accuracy": 0.11999231576919556,
|
|
"num_tokens": 1931403.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 6.5545590877532955,
|
|
"epoch": 0.8659874608150471,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999130208358114,
|
|
"loss": 6.4882,
|
|
"mean_token_accuracy": 0.11854184418916702,
|
|
"num_tokens": 1940774.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"entropy": 6.561131238937378,
|
|
"epoch": 0.8699059561128527,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999044570179983,
|
|
"loss": 6.5687,
|
|
"mean_token_accuracy": 0.11162992268800735,
|
|
"num_tokens": 1950045.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 6.606174612045288,
|
|
"epoch": 0.8738244514106583,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004998954912594541,
|
|
"loss": 6.4405,
|
|
"mean_token_accuracy": 0.11669689863920212,
|
|
"num_tokens": 1959303.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"entropy": 6.492517614364624,
|
|
"epoch": 0.877742946708464,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998861235762018,
|
|
"loss": 6.424,
|
|
"mean_token_accuracy": 0.1151531957089901,
|
|
"num_tokens": 1968583.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 6.590066766738891,
|
|
"epoch": 0.8816614420062696,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998763539849832,
|
|
"loss": 6.5317,
|
|
"mean_token_accuracy": 0.11314806714653969,
|
|
"num_tokens": 1977489.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 6.413249540328979,
|
|
"epoch": 0.8855799373040752,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998661825032579,
|
|
"loss": 6.306,
|
|
"mean_token_accuracy": 0.1186840571463108,
|
|
"num_tokens": 1985308.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 6.476549482345581,
|
|
"epoch": 0.8894984326018809,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004998556091492043,
|
|
"loss": 6.4658,
|
|
"mean_token_accuracy": 0.11185984387993812,
|
|
"num_tokens": 1993930.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"entropy": 6.523940944671631,
|
|
"epoch": 0.8934169278996865,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998446339417184,
|
|
"loss": 6.4676,
|
|
"mean_token_accuracy": 0.10710446015000344,
|
|
"num_tokens": 2003802.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 6.495583724975586,
|
|
"epoch": 0.8973354231974922,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998332569004149,
|
|
"loss": 6.3809,
|
|
"mean_token_accuracy": 0.11923198625445366,
|
|
"num_tokens": 2011957.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"entropy": 6.457823705673218,
|
|
"epoch": 0.9012539184952978,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998214780456263,
|
|
"loss": 6.3813,
|
|
"mean_token_accuracy": 0.12053523734211921,
|
|
"num_tokens": 2020741.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 6.47773175239563,
|
|
"epoch": 0.9051724137931034,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998092973984033,
|
|
"loss": 6.4479,
|
|
"mean_token_accuracy": 0.11438319608569145,
|
|
"num_tokens": 2029804.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"entropy": 6.543193197250366,
|
|
"epoch": 0.9090909090909091,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004997967149805147,
|
|
"loss": 6.5013,
|
|
"mean_token_accuracy": 0.116908498108387,
|
|
"num_tokens": 2038732.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 6.49971113204956,
|
|
"epoch": 0.9130094043887147,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997837308144474,
|
|
"loss": 6.4002,
|
|
"mean_token_accuracy": 0.12262723073363305,
|
|
"num_tokens": 2046362.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"entropy": 6.484364652633667,
|
|
"epoch": 0.9169278996865203,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997703449234062,
|
|
"loss": 6.3721,
|
|
"mean_token_accuracy": 0.11172138154506683,
|
|
"num_tokens": 2056018.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 6.384702682495117,
|
|
"epoch": 0.920846394984326,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004997565573313139,
|
|
"loss": 6.3547,
|
|
"mean_token_accuracy": 0.11563765183091164,
|
|
"num_tokens": 2065077.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 6.540685033798217,
|
|
"epoch": 0.9247648902821317,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997423680628109,
|
|
"loss": 6.3473,
|
|
"mean_token_accuracy": 0.1140012837946415,
|
|
"num_tokens": 2073405.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 6.3114136219024655,
|
|
"epoch": 0.9286833855799373,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499727777143256,
|
|
"loss": 6.3174,
|
|
"mean_token_accuracy": 0.11831804886460304,
|
|
"num_tokens": 2081891.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"entropy": 6.565793228149414,
|
|
"epoch": 0.932601880877743,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004997127845987255,
|
|
"loss": 6.3823,
|
|
"mean_token_accuracy": 0.11906470507383346,
|
|
"num_tokens": 2090732.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 6.347681045532227,
|
|
"epoch": 0.9365203761755486,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004996973904560135,
|
|
"loss": 6.306,
|
|
"mean_token_accuracy": 0.12649127021431922,
|
|
"num_tokens": 2099352.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"entropy": 6.5376229763031,
|
|
"epoch": 0.9404388714733543,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996815947426317,
|
|
"loss": 6.346,
|
|
"mean_token_accuracy": 0.1162750244140625,
|
|
"num_tokens": 2107365.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 6.353217506408692,
|
|
"epoch": 0.9443573667711599,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996653974868097,
|
|
"loss": 6.218,
|
|
"mean_token_accuracy": 0.1250108003616333,
|
|
"num_tokens": 2116192.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"entropy": 6.376781034469604,
|
|
"epoch": 0.9482758620689655,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996487987174946,
|
|
"loss": 6.3167,
|
|
"mean_token_accuracy": 0.12126539349555969,
|
|
"num_tokens": 2124410.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 6.452775669097901,
|
|
"epoch": 0.9521943573667712,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996317984643512,
|
|
"loss": 6.3483,
|
|
"mean_token_accuracy": 0.11980568394064903,
|
|
"num_tokens": 2132799.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"entropy": 6.459938097000122,
|
|
"epoch": 0.9561128526645768,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996143967577615,
|
|
"loss": 6.4935,
|
|
"mean_token_accuracy": 0.11375040411949158,
|
|
"num_tokens": 2141589.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 6.466125345230102,
|
|
"epoch": 0.9600313479623824,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004995965936288254,
|
|
"loss": 6.3714,
|
|
"mean_token_accuracy": 0.1200237862765789,
|
|
"num_tokens": 2150704.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 6.399578809738159,
|
|
"epoch": 0.9639498432601881,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995783891093597,
|
|
"loss": 6.2958,
|
|
"mean_token_accuracy": 0.12307887598872184,
|
|
"num_tokens": 2159587.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 6.330127573013305,
|
|
"epoch": 0.9678683385579937,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499559783231899,
|
|
"loss": 6.2928,
|
|
"mean_token_accuracy": 0.11578266024589538,
|
|
"num_tokens": 2169071.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"entropy": 6.421966409683227,
|
|
"epoch": 0.9717868338557993,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004995407760296952,
|
|
"loss": 6.3688,
|
|
"mean_token_accuracy": 0.11582615077495576,
|
|
"num_tokens": 2178248.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 6.425446796417236,
|
|
"epoch": 0.975705329153605,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995213675367169,
|
|
"loss": 6.2732,
|
|
"mean_token_accuracy": 0.11606336981058121,
|
|
"num_tokens": 2187520.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"entropy": 6.3828057765960695,
|
|
"epoch": 0.9796238244514106,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004995015577876504,
|
|
"loss": 6.3415,
|
|
"mean_token_accuracy": 0.11500431299209594,
|
|
"num_tokens": 2196861.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 6.361384725570678,
|
|
"epoch": 0.9835423197492164,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499481346817899,
|
|
"loss": 6.2502,
|
|
"mean_token_accuracy": 0.12506693974137306,
|
|
"num_tokens": 2205523.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"entropy": 6.345028877258301,
|
|
"epoch": 0.987460815047022,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004994607346635829,
|
|
"loss": 6.215,
|
|
"mean_token_accuracy": 0.12786355316638948,
|
|
"num_tokens": 2214291.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 6.440300559997558,
|
|
"epoch": 0.9913793103448276,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994397213615394,
|
|
"loss": 6.3295,
|
|
"mean_token_accuracy": 0.11995508670806884,
|
|
"num_tokens": 2223093.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"entropy": 6.406629800796509,
|
|
"epoch": 0.9952978056426333,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994183069493225,
|
|
"loss": 6.2832,
|
|
"mean_token_accuracy": 0.1210085429251194,
|
|
"num_tokens": 2231551.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 6.222655868530273,
|
|
"epoch": 0.9992163009404389,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004993964914652034,
|
|
"loss": 6.2829,
|
|
"mean_token_accuracy": 0.1268168218433857,
|
|
"num_tokens": 2240650.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 6.511307573318481,
|
|
"epoch": 1.0031347962382444,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004993742749481699,
|
|
"loss": 6.2046,
|
|
"mean_token_accuracy": 0.12171700075268746,
|
|
"num_tokens": 2249392.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 6.226445293426513,
|
|
"epoch": 1.0070532915360502,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993516574379265,
|
|
"loss": 6.0221,
|
|
"mean_token_accuracy": 0.12889871895313262,
|
|
"num_tokens": 2258100.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"entropy": 6.256803798675537,
|
|
"epoch": 1.0109717868338557,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993286389748943,
|
|
"loss": 6.0869,
|
|
"mean_token_accuracy": 0.13081810474395753,
|
|
"num_tokens": 2266661.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 6.348307514190674,
|
|
"epoch": 1.0148902821316614,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004993052196002112,
|
|
"loss": 6.1897,
|
|
"mean_token_accuracy": 0.12489164769649505,
|
|
"num_tokens": 2275475.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"entropy": 6.266294240951538,
|
|
"epoch": 1.0188087774294672,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004992813993557312,
|
|
"loss": 6.1038,
|
|
"mean_token_accuracy": 0.12707522958517076,
|
|
"num_tokens": 2284077.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 6.261200428009033,
|
|
"epoch": 1.0227272727272727,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499257178284025,
|
|
"loss": 6.1127,
|
|
"mean_token_accuracy": 0.12442725002765656,
|
|
"num_tokens": 2293029.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"entropy": 6.262345981597901,
|
|
"epoch": 1.0266457680250785,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004992325564283797,
|
|
"loss": 6.0351,
|
|
"mean_token_accuracy": 0.13578465953469276,
|
|
"num_tokens": 2301574.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 6.176527118682861,
|
|
"epoch": 1.030564263322884,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004992075338327984,
|
|
"loss": 6.0354,
|
|
"mean_token_accuracy": 0.13138693422079087,
|
|
"num_tokens": 2310366.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"entropy": 6.3460509300231935,
|
|
"epoch": 1.0344827586206897,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004991821105420006,
|
|
"loss": 6.1592,
|
|
"mean_token_accuracy": 0.1265966959297657,
|
|
"num_tokens": 2318943.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 6.320439672470092,
|
|
"epoch": 1.0384012539184952,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004991562866014219,
|
|
"loss": 6.1486,
|
|
"mean_token_accuracy": 0.12730259895324708,
|
|
"num_tokens": 2327173.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 6.314588499069214,
|
|
"epoch": 1.042319749216301,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004991300620572138,
|
|
"loss": 6.1095,
|
|
"mean_token_accuracy": 0.1259642593562603,
|
|
"num_tokens": 2336194.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 6.226988649368286,
|
|
"epoch": 1.0462382445141065,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004991034369562438,
|
|
"loss": 6.1824,
|
|
"mean_token_accuracy": 0.13008867129683493,
|
|
"num_tokens": 2344721.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"entropy": 6.249088525772095,
|
|
"epoch": 1.0501567398119123,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004990764113460953,
|
|
"loss": 6.1052,
|
|
"mean_token_accuracy": 0.1238970473408699,
|
|
"num_tokens": 2353406.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 6.400324296951294,
|
|
"epoch": 1.0540752351097178,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004990489852750675,
|
|
"loss": 6.2738,
|
|
"mean_token_accuracy": 0.11888742819428444,
|
|
"num_tokens": 2361896.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"entropy": 6.248968696594238,
|
|
"epoch": 1.0579937304075235,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004990211587921751,
|
|
"loss": 6.1014,
|
|
"mean_token_accuracy": 0.1269259825348854,
|
|
"num_tokens": 2371861.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 6.221920919418335,
|
|
"epoch": 1.061912225705329,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004989929319471487,
|
|
"loss": 6.0119,
|
|
"mean_token_accuracy": 0.13410131856799126,
|
|
"num_tokens": 2381555.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"entropy": 6.21340970993042,
|
|
"epoch": 1.0658307210031348,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004989643047904341,
|
|
"loss": 6.019,
|
|
"mean_token_accuracy": 0.13659467175602913,
|
|
"num_tokens": 2389851.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 6.126182651519775,
|
|
"epoch": 1.0697492163009406,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004989352773731928,
|
|
"loss": 5.9411,
|
|
"mean_token_accuracy": 0.13557685017585755,
|
|
"num_tokens": 2397764.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"entropy": 6.2651125431060795,
|
|
"epoch": 1.073667711598746,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004989058497473014,
|
|
"loss": 6.1303,
|
|
"mean_token_accuracy": 0.13204967901110648,
|
|
"num_tokens": 2406688.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 6.271961975097656,
|
|
"epoch": 1.0775862068965518,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004988760219653518,
|
|
"loss": 6.1871,
|
|
"mean_token_accuracy": 0.12600790411233903,
|
|
"num_tokens": 2415475.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 6.261941576004029,
|
|
"epoch": 1.0815047021943573,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004988457940806513,
|
|
"loss": 6.1287,
|
|
"mean_token_accuracy": 0.1264335960149765,
|
|
"num_tokens": 2425719.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 6.207798719406128,
|
|
"epoch": 1.085423197492163,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004988151661472218,
|
|
"loss": 6.0311,
|
|
"mean_token_accuracy": 0.13111473768949508,
|
|
"num_tokens": 2435138.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"entropy": 6.299766397476196,
|
|
"epoch": 1.0893416927899686,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004987841382198006,
|
|
"loss": 6.1646,
|
|
"mean_token_accuracy": 0.1282002493739128,
|
|
"num_tokens": 2443976.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 6.123603677749633,
|
|
"epoch": 1.0932601880877744,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004987527103538394,
|
|
"loss": 6.0608,
|
|
"mean_token_accuracy": 0.1322481580078602,
|
|
"num_tokens": 2453287.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"entropy": 6.353779220581055,
|
|
"epoch": 1.09717868338558,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000498720882605505,
|
|
"loss": 6.1542,
|
|
"mean_token_accuracy": 0.12652794942259787,
|
|
"num_tokens": 2462054.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 6.106056785583496,
|
|
"epoch": 1.1010971786833856,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004986886550316788,
|
|
"loss": 6.0653,
|
|
"mean_token_accuracy": 0.13403623849153518,
|
|
"num_tokens": 2470333.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"entropy": 6.214111185073852,
|
|
"epoch": 1.1050156739811912,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004986560276899565,
|
|
"loss": 6.0972,
|
|
"mean_token_accuracy": 0.1270233377814293,
|
|
"num_tokens": 2479621.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 6.246433115005493,
|
|
"epoch": 1.108934169278997,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004986230006386488,
|
|
"loss": 6.0655,
|
|
"mean_token_accuracy": 0.12891049459576606,
|
|
"num_tokens": 2488020.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"entropy": 6.274629163742065,
|
|
"epoch": 1.1128526645768024,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004985895739367802,
|
|
"loss": 6.1085,
|
|
"mean_token_accuracy": 0.12846623882651328,
|
|
"num_tokens": 2497779.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 6.158310031890869,
|
|
"epoch": 1.1167711598746082,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004985557476440895,
|
|
"loss": 6.0415,
|
|
"mean_token_accuracy": 0.12804254293441772,
|
|
"num_tokens": 2506791.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 6.2934564590454105,
|
|
"epoch": 1.1206896551724137,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00049852152182103,
|
|
"loss": 6.1559,
|
|
"mean_token_accuracy": 0.1281921535730362,
|
|
"num_tokens": 2515576.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 6.240061712265015,
|
|
"epoch": 1.1246081504702194,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004984868965287686,
|
|
"loss": 6.0674,
|
|
"mean_token_accuracy": 0.1283339627087116,
|
|
"num_tokens": 2523933.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"entropy": 6.1820862770080565,
|
|
"epoch": 1.1285266457680252,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004984518718291864,
|
|
"loss": 6.0056,
|
|
"mean_token_accuracy": 0.12331884428858757,
|
|
"num_tokens": 2532453.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 6.157241773605347,
|
|
"epoch": 1.1324451410658307,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004984164477848783,
|
|
"loss": 6.064,
|
|
"mean_token_accuracy": 0.12599845975637436,
|
|
"num_tokens": 2541149.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"entropy": 6.237585258483887,
|
|
"epoch": 1.1363636363636362,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004983806244591528,
|
|
"loss": 6.0312,
|
|
"mean_token_accuracy": 0.13384631648659706,
|
|
"num_tokens": 2550302.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 6.050166320800781,
|
|
"epoch": 1.140282131661442,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004983444019160318,
|
|
"loss": 5.8966,
|
|
"mean_token_accuracy": 0.13734075129032136,
|
|
"num_tokens": 2558507.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"entropy": 6.258242225646972,
|
|
"epoch": 1.1442006269592477,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004983077802202511,
|
|
"loss": 6.1788,
|
|
"mean_token_accuracy": 0.1290736488997936,
|
|
"num_tokens": 2567487.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 6.212359237670898,
|
|
"epoch": 1.1481191222570533,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004982707594372595,
|
|
"loss": 6.0304,
|
|
"mean_token_accuracy": 0.12699214443564416,
|
|
"num_tokens": 2576232.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"entropy": 6.1208006858825685,
|
|
"epoch": 1.152037617554859,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000498233339633219,
|
|
"loss": 6.0329,
|
|
"mean_token_accuracy": 0.13209858015179635,
|
|
"num_tokens": 2585081.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 6.143232536315918,
|
|
"epoch": 1.1559561128526645,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000498195520875005,
|
|
"loss": 6.0115,
|
|
"mean_token_accuracy": 0.12682496458292009,
|
|
"num_tokens": 2593890.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 6.200338554382324,
|
|
"epoch": 1.1598746081504703,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004981573032302059,
|
|
"loss": 6.0958,
|
|
"mean_token_accuracy": 0.1289630651473999,
|
|
"num_tokens": 2602011.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 6.149346542358399,
|
|
"epoch": 1.1637931034482758,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004981186867671225,
|
|
"loss": 5.9939,
|
|
"mean_token_accuracy": 0.1361051805317402,
|
|
"num_tokens": 2610366.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"entropy": 6.113060522079468,
|
|
"epoch": 1.1677115987460815,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004980796715547691,
|
|
"loss": 6.0004,
|
|
"mean_token_accuracy": 0.13451212123036385,
|
|
"num_tokens": 2619737.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 6.229964303970337,
|
|
"epoch": 1.171630094043887,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004980402576628717,
|
|
"loss": 6.0233,
|
|
"mean_token_accuracy": 0.13166360333561897,
|
|
"num_tokens": 2628176.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"entropy": 6.0950154781341555,
|
|
"epoch": 1.1755485893416928,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004980004451618697,
|
|
"loss": 6.0384,
|
|
"mean_token_accuracy": 0.13299553468823433,
|
|
"num_tokens": 2636195.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.1755485893416928,
|
|
"eval_entropy": 6.165283305700435,
|
|
"eval_loss": 6.313858985900879,
|
|
"eval_mean_token_accuracy": 0.1281498552105108,
|
|
"eval_num_tokens": 2636195.0,
|
|
"eval_runtime": 2.8434,
|
|
"eval_samples_per_second": 1449.67,
|
|
"eval_steps_per_second": 181.472,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 6.294966459274292,
|
|
"epoch": 1.1794670846394983,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004979602341229144,
|
|
"loss": 6.1167,
|
|
"mean_token_accuracy": 0.13073519617319107,
|
|
"num_tokens": 2645035.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"entropy": 6.177581834793091,
|
|
"epoch": 1.183385579937304,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004979196246178694,
|
|
"loss": 6.0716,
|
|
"mean_token_accuracy": 0.13222582265734673,
|
|
"num_tokens": 2653239.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 6.1044244289398195,
|
|
"epoch": 1.1873040752351098,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004978786167193105,
|
|
"loss": 6.0998,
|
|
"mean_token_accuracy": 0.13036856576800346,
|
|
"num_tokens": 2662532.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"entropy": 6.214647531509399,
|
|
"epoch": 1.1912225705329154,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004978372105005254,
|
|
"loss": 6.0841,
|
|
"mean_token_accuracy": 0.12801975160837173,
|
|
"num_tokens": 2671867.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 6.091090679168701,
|
|
"epoch": 1.1951410658307209,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004977954060355139,
|
|
"loss": 6.0188,
|
|
"mean_token_accuracy": 0.1304849162697792,
|
|
"num_tokens": 2680693.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 6.281388568878174,
|
|
"epoch": 1.1990595611285266,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004977532033989871,
|
|
"loss": 6.0883,
|
|
"mean_token_accuracy": 0.1320122368633747,
|
|
"num_tokens": 2689228.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 6.080573844909668,
|
|
"epoch": 1.2029780564263324,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004977106026663681,
|
|
"loss": 5.9666,
|
|
"mean_token_accuracy": 0.12833520472049714,
|
|
"num_tokens": 2698483.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"entropy": 6.26170859336853,
|
|
"epoch": 1.206896551724138,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004976676039137914,
|
|
"loss": 6.1056,
|
|
"mean_token_accuracy": 0.126655612885952,
|
|
"num_tokens": 2706920.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 6.0963341236114506,
|
|
"epoch": 1.2108150470219436,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004976242072181026,
|
|
"loss": 5.9939,
|
|
"mean_token_accuracy": 0.13806139603257178,
|
|
"num_tokens": 2715954.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"entropy": 6.142190265655517,
|
|
"epoch": 1.2147335423197492,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004975804126568587,
|
|
"loss": 6.0112,
|
|
"mean_token_accuracy": 0.13544101044535636,
|
|
"num_tokens": 2724890.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 6.178206443786621,
|
|
"epoch": 1.218652037617555,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004975362203083277,
|
|
"loss": 6.037,
|
|
"mean_token_accuracy": 0.12968128994107248,
|
|
"num_tokens": 2733688.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"entropy": 6.206477451324463,
|
|
"epoch": 1.2225705329153604,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004974916302514886,
|
|
"loss": 6.0428,
|
|
"mean_token_accuracy": 0.12749146521091462,
|
|
"num_tokens": 2743364.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 6.040554809570312,
|
|
"epoch": 1.2264890282131662,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004974466425660307,
|
|
"loss": 6.1001,
|
|
"mean_token_accuracy": 0.1249497301876545,
|
|
"num_tokens": 2754276.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"entropy": 6.225854349136353,
|
|
"epoch": 1.2304075235109717,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004974012573323545,
|
|
"loss": 5.883,
|
|
"mean_token_accuracy": 0.14270371645689012,
|
|
"num_tokens": 2762680.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 5.993971490859986,
|
|
"epoch": 1.2343260188087775,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004973554746315709,
|
|
"loss": 5.9893,
|
|
"mean_token_accuracy": 0.14108432978391647,
|
|
"num_tokens": 2771161.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 6.177520227432251,
|
|
"epoch": 1.238244514106583,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004973092945455009,
|
|
"loss": 5.9578,
|
|
"mean_token_accuracy": 0.14068967550992967,
|
|
"num_tokens": 2779321.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 6.1567606925964355,
|
|
"epoch": 1.2421630094043887,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004972627171566757,
|
|
"loss": 6.1195,
|
|
"mean_token_accuracy": 0.13284459933638573,
|
|
"num_tokens": 2788425.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"entropy": 6.097169017791748,
|
|
"epoch": 1.2460815047021945,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004972157425483368,
|
|
"loss": 5.9674,
|
|
"mean_token_accuracy": 0.1364126294851303,
|
|
"num_tokens": 2796747.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 6.055204153060913,
|
|
"epoch": 1.25,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004971683708044353,
|
|
"loss": 6.0172,
|
|
"mean_token_accuracy": 0.1358514852821827,
|
|
"num_tokens": 2805335.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"entropy": 6.133439636230468,
|
|
"epoch": 1.2539184952978055,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004971206020096323,
|
|
"loss": 5.9043,
|
|
"mean_token_accuracy": 0.13950337022542952,
|
|
"num_tokens": 2813350.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 6.165687799453735,
|
|
"epoch": 1.2578369905956113,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004970724362492984,
|
|
"loss": 6.0934,
|
|
"mean_token_accuracy": 0.13864696100354196,
|
|
"num_tokens": 2821922.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"entropy": 6.101494073867798,
|
|
"epoch": 1.261755485893417,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004970238736095135,
|
|
"loss": 5.9931,
|
|
"mean_token_accuracy": 0.1329360119998455,
|
|
"num_tokens": 2830832.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 6.160137414932251,
|
|
"epoch": 1.2656739811912225,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004969749141770671,
|
|
"loss": 6.0053,
|
|
"mean_token_accuracy": 0.13711865544319152,
|
|
"num_tokens": 2839699.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"entropy": 6.093396949768066,
|
|
"epoch": 1.2695924764890283,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004969255580394575,
|
|
"loss": 6.0811,
|
|
"mean_token_accuracy": 0.13654499500989914,
|
|
"num_tokens": 2848293.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 6.201265144348144,
|
|
"epoch": 1.2735109717868338,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000496875805284892,
|
|
"loss": 6.0327,
|
|
"mean_token_accuracy": 0.13467409685254098,
|
|
"num_tokens": 2856468.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"entropy": 6.086582183837891,
|
|
"epoch": 1.2774294670846396,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004968256560022871,
|
|
"loss": 5.9502,
|
|
"mean_token_accuracy": 0.1351684033870697,
|
|
"num_tokens": 2865366.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 6.079522609710693,
|
|
"epoch": 1.281347962382445,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004967751102812676,
|
|
"loss": 6.0118,
|
|
"mean_token_accuracy": 0.13846911042928695,
|
|
"num_tokens": 2873520.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"entropy": 6.133967542648316,
|
|
"epoch": 1.2852664576802508,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004967241682121669,
|
|
"loss": 6.0771,
|
|
"mean_token_accuracy": 0.13166362345218657,
|
|
"num_tokens": 2883324.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 6.170493459701538,
|
|
"epoch": 1.2891849529780564,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004966728298860267,
|
|
"loss": 6.0178,
|
|
"mean_token_accuracy": 0.13498894423246383,
|
|
"num_tokens": 2892200.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"entropy": 6.1070939064025875,
|
|
"epoch": 1.293103448275862,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004966210953945969,
|
|
"loss": 5.9858,
|
|
"mean_token_accuracy": 0.13817497938871384,
|
|
"num_tokens": 2900597.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 6.192414903640747,
|
|
"epoch": 1.2970219435736676,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004965689648303355,
|
|
"loss": 6.0118,
|
|
"mean_token_accuracy": 0.13011416494846345,
|
|
"num_tokens": 2909869.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"entropy": 6.000342845916748,
|
|
"epoch": 1.3009404388714734,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004965164382864083,
|
|
"loss": 6.0278,
|
|
"mean_token_accuracy": 0.1339656464755535,
|
|
"num_tokens": 2919166.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 6.232674837112427,
|
|
"epoch": 1.3048589341692791,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004964635158566886,
|
|
"loss": 6.0383,
|
|
"mean_token_accuracy": 0.13142292499542235,
|
|
"num_tokens": 2927884.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"entropy": 6.099806451797486,
|
|
"epoch": 1.3087774294670846,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004964101976357574,
|
|
"loss": 5.9802,
|
|
"mean_token_accuracy": 0.14010540917515754,
|
|
"num_tokens": 2936737.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 6.117660140991211,
|
|
"epoch": 1.3126959247648902,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000496356483718903,
|
|
"loss": 6.0705,
|
|
"mean_token_accuracy": 0.13366047590970992,
|
|
"num_tokens": 2944203.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"entropy": 6.167582654953003,
|
|
"epoch": 1.316614420062696,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004963023742021211,
|
|
"loss": 6.0739,
|
|
"mean_token_accuracy": 0.1311576023697853,
|
|
"num_tokens": 2952639.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 6.16412582397461,
|
|
"epoch": 1.3205329153605017,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000496247869182114,
|
|
"loss": 6.0037,
|
|
"mean_token_accuracy": 0.1342972233891487,
|
|
"num_tokens": 2961482.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"entropy": 6.074027061462402,
|
|
"epoch": 1.3244514106583072,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004961929687562909,
|
|
"loss": 6.0286,
|
|
"mean_token_accuracy": 0.1309918761253357,
|
|
"num_tokens": 2970927.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 6.046236276626587,
|
|
"epoch": 1.328369905956113,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004961376730227681,
|
|
"loss": 5.9461,
|
|
"mean_token_accuracy": 0.1402522951364517,
|
|
"num_tokens": 2979329.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"entropy": 6.147798299789429,
|
|
"epoch": 1.3322884012539185,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004960819820803675,
|
|
"loss": 5.9328,
|
|
"mean_token_accuracy": 0.14073051139712334,
|
|
"num_tokens": 2986894.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 6.0277563571929935,
|
|
"epoch": 1.3362068965517242,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004960258960286185,
|
|
"loss": 5.9188,
|
|
"mean_token_accuracy": 0.1406124599277973,
|
|
"num_tokens": 2994721.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"entropy": 6.188552713394165,
|
|
"epoch": 1.3401253918495297,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004959694149677553,
|
|
"loss": 6.0716,
|
|
"mean_token_accuracy": 0.12887891680002211,
|
|
"num_tokens": 3003165.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 6.130622291564942,
|
|
"epoch": 1.3440438871473355,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004959125389987193,
|
|
"loss": 6.0519,
|
|
"mean_token_accuracy": 0.12692190557718278,
|
|
"num_tokens": 3012178.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"entropy": 6.024265241622925,
|
|
"epoch": 1.347962382445141,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004958552682231567,
|
|
"loss": 5.957,
|
|
"mean_token_accuracy": 0.1394257813692093,
|
|
"num_tokens": 3021685.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 6.13150839805603,
|
|
"epoch": 1.3518808777429467,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004957976027434199,
|
|
"loss": 6.0119,
|
|
"mean_token_accuracy": 0.12856094017624856,
|
|
"num_tokens": 3030667.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"entropy": 6.137048673629761,
|
|
"epoch": 1.3557993730407523,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004957395426625663,
|
|
"loss": 6.064,
|
|
"mean_token_accuracy": 0.13227416425943375,
|
|
"num_tokens": 3039501.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 6.055467891693115,
|
|
"epoch": 1.359717868338558,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004956810880843587,
|
|
"loss": 5.9946,
|
|
"mean_token_accuracy": 0.13389588594436647,
|
|
"num_tokens": 3048547.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"entropy": 6.150904512405395,
|
|
"epoch": 1.3636363636363638,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000495622239113265,
|
|
"loss": 5.9482,
|
|
"mean_token_accuracy": 0.13820023238658904,
|
|
"num_tokens": 3056521.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 6.032876205444336,
|
|
"epoch": 1.3675548589341693,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004955629958544577,
|
|
"loss": 5.9428,
|
|
"mean_token_accuracy": 0.1336909145116806,
|
|
"num_tokens": 3066007.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"entropy": 6.174657106399536,
|
|
"epoch": 1.3714733542319748,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004955033584138143,
|
|
"loss": 6.1364,
|
|
"mean_token_accuracy": 0.12483339086174965,
|
|
"num_tokens": 3075111.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 6.001473474502563,
|
|
"epoch": 1.3753918495297806,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004954433268979164,
|
|
"loss": 5.93,
|
|
"mean_token_accuracy": 0.15043148621916771,
|
|
"num_tokens": 3083766.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"entropy": 6.056762075424194,
|
|
"epoch": 1.3793103448275863,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004953829014140502,
|
|
"loss": 6.0034,
|
|
"mean_token_accuracy": 0.13454223051667213,
|
|
"num_tokens": 3092103.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 6.0849854946136475,
|
|
"epoch": 1.3832288401253918,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004953220820702057,
|
|
"loss": 6.0065,
|
|
"mean_token_accuracy": 0.13198206946253777,
|
|
"num_tokens": 3101286.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"entropy": 6.222050523757934,
|
|
"epoch": 1.3871473354231976,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004952608689750771,
|
|
"loss": 6.024,
|
|
"mean_token_accuracy": 0.13246920630335807,
|
|
"num_tokens": 3110554.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 5.981888675689698,
|
|
"epoch": 1.391065830721003,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004951992622380619,
|
|
"loss": 5.8669,
|
|
"mean_token_accuracy": 0.14516761153936386,
|
|
"num_tokens": 3119265.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"entropy": 6.022592401504516,
|
|
"epoch": 1.3949843260188088,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004951372619692615,
|
|
"loss": 5.9176,
|
|
"mean_token_accuracy": 0.13864269405603408,
|
|
"num_tokens": 3127962.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 6.138263750076294,
|
|
"epoch": 1.3989028213166144,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004950748682794804,
|
|
"loss": 6.0343,
|
|
"mean_token_accuracy": 0.13279442861676216,
|
|
"num_tokens": 3137265.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"entropy": 5.982170152664184,
|
|
"epoch": 1.40282131661442,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004950120812802262,
|
|
"loss": 5.8679,
|
|
"mean_token_accuracy": 0.13771192952990532,
|
|
"num_tokens": 3146458.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 6.121512508392334,
|
|
"epoch": 1.4067398119122256,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004949489010837095,
|
|
"loss": 6.0051,
|
|
"mean_token_accuracy": 0.13202869817614554,
|
|
"num_tokens": 3155557.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"entropy": 6.031117582321167,
|
|
"epoch": 1.4106583072100314,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004948853278028436,
|
|
"loss": 5.8921,
|
|
"mean_token_accuracy": 0.1407647594809532,
|
|
"num_tokens": 3163478.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 6.06480302810669,
|
|
"epoch": 1.414576802507837,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000494821361551244,
|
|
"loss": 5.9734,
|
|
"mean_token_accuracy": 0.13325524404644967,
|
|
"num_tokens": 3172632.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"entropy": 6.034041261672973,
|
|
"epoch": 1.4184952978056427,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004947570024432291,
|
|
"loss": 6.0159,
|
|
"mean_token_accuracy": 0.14027554914355278,
|
|
"num_tokens": 3181325.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 6.120739078521728,
|
|
"epoch": 1.4224137931034484,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004946922505938189,
|
|
"loss": 6.058,
|
|
"mean_token_accuracy": 0.140015921741724,
|
|
"num_tokens": 3189783.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"entropy": 6.078346109390258,
|
|
"epoch": 1.426332288401254,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004946271061187354,
|
|
"loss": 5.9115,
|
|
"mean_token_accuracy": 0.1306297406554222,
|
|
"num_tokens": 3198605.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 6.07373309135437,
|
|
"epoch": 1.4302507836990594,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004945615691344025,
|
|
"loss": 6.0025,
|
|
"mean_token_accuracy": 0.12800363823771477,
|
|
"num_tokens": 3208061.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"entropy": 6.044371461868286,
|
|
"epoch": 1.4341692789968652,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004944956397579453,
|
|
"loss": 5.9286,
|
|
"mean_token_accuracy": 0.1459761567413807,
|
|
"num_tokens": 3216093.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 6.0173381805419925,
|
|
"epoch": 1.438087774294671,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004944293181071902,
|
|
"loss": 5.9027,
|
|
"mean_token_accuracy": 0.14357266277074815,
|
|
"num_tokens": 3224446.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"entropy": 6.0938514232635494,
|
|
"epoch": 1.4420062695924765,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004943626043006649,
|
|
"loss": 6.0177,
|
|
"mean_token_accuracy": 0.135706390440464,
|
|
"num_tokens": 3233398.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 6.068123865127563,
|
|
"epoch": 1.4459247648902822,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000494295498457598,
|
|
"loss": 6.0156,
|
|
"mean_token_accuracy": 0.13109127432107925,
|
|
"num_tokens": 3243391.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"entropy": 6.0765832424163815,
|
|
"epoch": 1.4498432601880877,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004942280006979179,
|
|
"loss": 5.96,
|
|
"mean_token_accuracy": 0.13749199137091636,
|
|
"num_tokens": 3251954.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 6.0438025951385494,
|
|
"epoch": 1.4537617554858935,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004941601111422546,
|
|
"loss": 5.9851,
|
|
"mean_token_accuracy": 0.1373910054564476,
|
|
"num_tokens": 3261121.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"entropy": 5.976307153701782,
|
|
"epoch": 1.457680250783699,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004940918299119375,
|
|
"loss": 5.9028,
|
|
"mean_token_accuracy": 0.1375894144177437,
|
|
"num_tokens": 3270179.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 5.898191833496094,
|
|
"epoch": 1.4615987460815048,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004940231571289962,
|
|
"loss": 5.8893,
|
|
"mean_token_accuracy": 0.14039459228515624,
|
|
"num_tokens": 3279749.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"entropy": 6.1549577713012695,
|
|
"epoch": 1.4655172413793103,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004939540929161603,
|
|
"loss": 5.9246,
|
|
"mean_token_accuracy": 0.13927078545093535,
|
|
"num_tokens": 3288458.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 6.00097017288208,
|
|
"epoch": 1.469435736677116,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004938846373968586,
|
|
"loss": 5.8991,
|
|
"mean_token_accuracy": 0.14272075444459914,
|
|
"num_tokens": 3297231.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"entropy": 6.032529544830322,
|
|
"epoch": 1.4733542319749215,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004938147906952194,
|
|
"loss": 6.0329,
|
|
"mean_token_accuracy": 0.13757839426398277,
|
|
"num_tokens": 3306915.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 6.079828405380249,
|
|
"epoch": 1.4772727272727273,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00049374455293607,
|
|
"loss": 5.9958,
|
|
"mean_token_accuracy": 0.13423861265182496,
|
|
"num_tokens": 3315985.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"entropy": 6.122959566116333,
|
|
"epoch": 1.481191222570533,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004936739242449369,
|
|
"loss": 5.9895,
|
|
"mean_token_accuracy": 0.13724515214562416,
|
|
"num_tokens": 3324924.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 5.991518545150757,
|
|
"epoch": 1.4851097178683386,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004936029047480447,
|
|
"loss": 5.9801,
|
|
"mean_token_accuracy": 0.13726715967059136,
|
|
"num_tokens": 3333464.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"entropy": 6.0822971820831295,
|
|
"epoch": 1.489028213166144,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004935314945723171,
|
|
"loss": 5.8806,
|
|
"mean_token_accuracy": 0.14093400090932845,
|
|
"num_tokens": 3342441.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 5.940945720672607,
|
|
"epoch": 1.4929467084639498,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004934596938453754,
|
|
"loss": 5.9352,
|
|
"mean_token_accuracy": 0.1342185415327549,
|
|
"num_tokens": 3351338.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"entropy": 6.092441558837891,
|
|
"epoch": 1.4968652037617556,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004933875026955391,
|
|
"loss": 5.9069,
|
|
"mean_token_accuracy": 0.1339087277650833,
|
|
"num_tokens": 3360356.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 6.088913869857788,
|
|
"epoch": 1.500783699059561,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004933149212518258,
|
|
"loss": 5.988,
|
|
"mean_token_accuracy": 0.13323872461915015,
|
|
"num_tokens": 3369485.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"entropy": 6.02648138999939,
|
|
"epoch": 1.5047021943573666,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004932419496439501,
|
|
"loss": 5.9519,
|
|
"mean_token_accuracy": 0.14375862777233123,
|
|
"num_tokens": 3378355.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 5.99138445854187,
|
|
"epoch": 1.5086206896551724,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000493168588002324,
|
|
"loss": 5.8649,
|
|
"mean_token_accuracy": 0.14118586033582686,
|
|
"num_tokens": 3386729.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"entropy": 6.119981384277343,
|
|
"epoch": 1.5125391849529781,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004930948364580569,
|
|
"loss": 6.088,
|
|
"mean_token_accuracy": 0.1284665696322918,
|
|
"num_tokens": 3395822.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 6.04303822517395,
|
|
"epoch": 1.5164576802507836,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004930206951429546,
|
|
"loss": 5.99,
|
|
"mean_token_accuracy": 0.14219039529561997,
|
|
"num_tokens": 3404671.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"entropy": 6.036664438247681,
|
|
"epoch": 1.5203761755485894,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004929461641895197,
|
|
"loss": 5.9301,
|
|
"mean_token_accuracy": 0.13862462490797042,
|
|
"num_tokens": 3413289.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 6.010977125167846,
|
|
"epoch": 1.5242946708463951,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000492871243730951,
|
|
"loss": 5.9675,
|
|
"mean_token_accuracy": 0.1358363598585129,
|
|
"num_tokens": 3421960.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"entropy": 6.0582923412323,
|
|
"epoch": 1.5282131661442007,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004927959339011437,
|
|
"loss": 5.9564,
|
|
"mean_token_accuracy": 0.13902525305747987,
|
|
"num_tokens": 3431958.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 5.846984767913819,
|
|
"epoch": 1.5321316614420062,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004927202348346885,
|
|
"loss": 5.7801,
|
|
"mean_token_accuracy": 0.14725423008203506,
|
|
"num_tokens": 3441438.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"entropy": 6.118220949172974,
|
|
"epoch": 1.536050156739812,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000492644146666872,
|
|
"loss": 5.9105,
|
|
"mean_token_accuracy": 0.14317999482154847,
|
|
"num_tokens": 3450176.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 5.923649597167969,
|
|
"epoch": 1.5399686520376177,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004925676695336761,
|
|
"loss": 5.8946,
|
|
"mean_token_accuracy": 0.14172052592039108,
|
|
"num_tokens": 3458685.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"entropy": 6.046487474441529,
|
|
"epoch": 1.5438871473354232,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004924908035717777,
|
|
"loss": 6.0445,
|
|
"mean_token_accuracy": 0.13102332279086112,
|
|
"num_tokens": 3468362.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 6.12939486503601,
|
|
"epoch": 1.5478056426332287,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000492413548918549,
|
|
"loss": 5.9607,
|
|
"mean_token_accuracy": 0.13970244973897933,
|
|
"num_tokens": 3476791.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"entropy": 5.925748825073242,
|
|
"epoch": 1.5517241379310345,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004923359057120563,
|
|
"loss": 5.8781,
|
|
"mean_token_accuracy": 0.13300700336694718,
|
|
"num_tokens": 3486486.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 5.951593446731567,
|
|
"epoch": 1.5556426332288402,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004922578740910608,
|
|
"loss": 5.8334,
|
|
"mean_token_accuracy": 0.14717195332050323,
|
|
"num_tokens": 3496145.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"entropy": 5.967483854293823,
|
|
"epoch": 1.5595611285266457,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004921794541950177,
|
|
"loss": 5.8214,
|
|
"mean_token_accuracy": 0.14525392055511474,
|
|
"num_tokens": 3504644.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 5.994989633560181,
|
|
"epoch": 1.5634796238244513,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004921006461640758,
|
|
"loss": 5.8562,
|
|
"mean_token_accuracy": 0.13699238896369934,
|
|
"num_tokens": 3513126.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"entropy": 5.833573675155639,
|
|
"epoch": 1.567398119122257,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000492021450139078,
|
|
"loss": 5.7981,
|
|
"mean_token_accuracy": 0.14626943692564964,
|
|
"num_tokens": 3521505.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 1.567398119122257,
|
|
"eval_entropy": 5.888265516406806,
|
|
"eval_loss": 6.116030216217041,
|
|
"eval_mean_token_accuracy": 0.13753249434855327,
|
|
"eval_num_tokens": 3521505.0,
|
|
"eval_runtime": 2.8299,
|
|
"eval_samples_per_second": 1456.593,
|
|
"eval_steps_per_second": 182.339,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 6.132147789001465,
|
|
"epoch": 1.5713166144200628,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004919418662615605,
|
|
"loss": 6.0583,
|
|
"mean_token_accuracy": 0.1335877738893032,
|
|
"num_tokens": 3530008.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"entropy": 6.083332824707031,
|
|
"epoch": 1.5752351097178683,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004918618946737525,
|
|
"loss": 5.9278,
|
|
"mean_token_accuracy": 0.13504896759986879,
|
|
"num_tokens": 3538778.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 5.979771709442138,
|
|
"epoch": 1.579153605015674,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004917815355185762,
|
|
"loss": 5.9164,
|
|
"mean_token_accuracy": 0.13633493483066558,
|
|
"num_tokens": 3547108.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"entropy": 5.988805103302002,
|
|
"epoch": 1.5830721003134798,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004917007889396464,
|
|
"loss": 5.9513,
|
|
"mean_token_accuracy": 0.13188610523939132,
|
|
"num_tokens": 3556094.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 5.985982990264892,
|
|
"epoch": 1.5869905956112853,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004916196550812706,
|
|
"loss": 5.8151,
|
|
"mean_token_accuracy": 0.14568774104118348,
|
|
"num_tokens": 3564222.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"entropy": 5.818160581588745,
|
|
"epoch": 1.5909090909090908,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004915381340884477,
|
|
"loss": 5.827,
|
|
"mean_token_accuracy": 0.14547126069664956,
|
|
"num_tokens": 3573123.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 6.050127172470093,
|
|
"epoch": 1.5948275862068966,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004914562261068693,
|
|
"loss": 5.8571,
|
|
"mean_token_accuracy": 0.14239953309297562,
|
|
"num_tokens": 3581462.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"entropy": 5.88056173324585,
|
|
"epoch": 1.5987460815047023,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004913739312829181,
|
|
"loss": 5.8744,
|
|
"mean_token_accuracy": 0.14017535969614983,
|
|
"num_tokens": 3590718.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 6.000182294845581,
|
|
"epoch": 1.6026645768025078,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004912912497636683,
|
|
"loss": 5.8899,
|
|
"mean_token_accuracy": 0.13969212546944618,
|
|
"num_tokens": 3599890.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"entropy": 6.039797258377075,
|
|
"epoch": 1.6065830721003134,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004912081816968853,
|
|
"loss": 5.9815,
|
|
"mean_token_accuracy": 0.14007024392485617,
|
|
"num_tokens": 3607927.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 6.0202032089233395,
|
|
"epoch": 1.6105015673981191,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000491124727231025,
|
|
"loss": 5.838,
|
|
"mean_token_accuracy": 0.14422516226768495,
|
|
"num_tokens": 3615935.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"entropy": 5.8747539043426515,
|
|
"epoch": 1.6144200626959249,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004910408865152343,
|
|
"loss": 5.8654,
|
|
"mean_token_accuracy": 0.1397896021604538,
|
|
"num_tokens": 3624475.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 6.097391796112061,
|
|
"epoch": 1.6183385579937304,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004909566596993498,
|
|
"loss": 6.072,
|
|
"mean_token_accuracy": 0.131089448928833,
|
|
"num_tokens": 3633995.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"entropy": 6.067581701278686,
|
|
"epoch": 1.622257053291536,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004908720469338988,
|
|
"loss": 5.9294,
|
|
"mean_token_accuracy": 0.14296501129865646,
|
|
"num_tokens": 3643121.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 5.959115791320801,
|
|
"epoch": 1.6261755485893417,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004907870483700979,
|
|
"loss": 5.8621,
|
|
"mean_token_accuracy": 0.14175319969654082,
|
|
"num_tokens": 3652285.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"entropy": 5.929019594192505,
|
|
"epoch": 1.6300940438871474,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004907016641598534,
|
|
"loss": 5.926,
|
|
"mean_token_accuracy": 0.13432129770517348,
|
|
"num_tokens": 3661441.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 6.0357630252838135,
|
|
"epoch": 1.634012539184953,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004906158944557607,
|
|
"loss": 5.8105,
|
|
"mean_token_accuracy": 0.14453670606017113,
|
|
"num_tokens": 3669364.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"entropy": 5.9998321533203125,
|
|
"epoch": 1.6379310344827587,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000490529739411104,
|
|
"loss": 5.9024,
|
|
"mean_token_accuracy": 0.13794894218444825,
|
|
"num_tokens": 3677848.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 5.93680510520935,
|
|
"epoch": 1.6418495297805644,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004904431991798565,
|
|
"loss": 5.8138,
|
|
"mean_token_accuracy": 0.14087132290005683,
|
|
"num_tokens": 3686617.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"entropy": 5.955365371704102,
|
|
"epoch": 1.64576802507837,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004903562739166797,
|
|
"loss": 5.8044,
|
|
"mean_token_accuracy": 0.1422215446829796,
|
|
"num_tokens": 3695106.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 5.7903650283813475,
|
|
"epoch": 1.6496865203761755,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004902689637769229,
|
|
"loss": 5.7518,
|
|
"mean_token_accuracy": 0.15124305188655854,
|
|
"num_tokens": 3703167.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"entropy": 5.9747395515441895,
|
|
"epoch": 1.6536050156739812,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004901812689166237,
|
|
"loss": 5.8666,
|
|
"mean_token_accuracy": 0.13829359784722328,
|
|
"num_tokens": 3711597.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 5.945003080368042,
|
|
"epoch": 1.657523510971787,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004900931894925069,
|
|
"loss": 5.9488,
|
|
"mean_token_accuracy": 0.14057869464159012,
|
|
"num_tokens": 3720277.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"entropy": 5.949292469024658,
|
|
"epoch": 1.6614420062695925,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004900047256619849,
|
|
"loss": 5.921,
|
|
"mean_token_accuracy": 0.13772802650928498,
|
|
"num_tokens": 3729832.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 5.9761933326721195,
|
|
"epoch": 1.665360501567398,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004899158775831566,
|
|
"loss": 5.8632,
|
|
"mean_token_accuracy": 0.14447186067700385,
|
|
"num_tokens": 3738390.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"entropy": 5.933046817779541,
|
|
"epoch": 1.6692789968652038,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004898266454148081,
|
|
"loss": 5.9185,
|
|
"mean_token_accuracy": 0.14356547445058823,
|
|
"num_tokens": 3747046.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 5.945548963546753,
|
|
"epoch": 1.6731974921630095,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004897370293164119,
|
|
"loss": 5.8896,
|
|
"mean_token_accuracy": 0.13832552805542947,
|
|
"num_tokens": 3755818.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"entropy": 6.093547344207764,
|
|
"epoch": 1.677115987460815,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004896470294481262,
|
|
"loss": 5.9861,
|
|
"mean_token_accuracy": 0.13742954656481743,
|
|
"num_tokens": 3764123.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 5.950912809371948,
|
|
"epoch": 1.6810344827586206,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004895566459707954,
|
|
"loss": 6.035,
|
|
"mean_token_accuracy": 0.12903214767575263,
|
|
"num_tokens": 3773105.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"entropy": 6.076629734039306,
|
|
"epoch": 1.6849529780564263,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004894658790459498,
|
|
"loss": 5.8805,
|
|
"mean_token_accuracy": 0.14274725392460824,
|
|
"num_tokens": 3781417.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 6.058791875839233,
|
|
"epoch": 1.688871473354232,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004893747288358041,
|
|
"loss": 5.9792,
|
|
"mean_token_accuracy": 0.13869686052203178,
|
|
"num_tokens": 3789657.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"entropy": 5.9904515743255615,
|
|
"epoch": 1.6927899686520376,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000489283195503259,
|
|
"loss": 5.9875,
|
|
"mean_token_accuracy": 0.13768337592482566,
|
|
"num_tokens": 3798700.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 5.9594886302948,
|
|
"epoch": 1.6967084639498433,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000489191279211899,
|
|
"loss": 5.8479,
|
|
"mean_token_accuracy": 0.14359756112098693,
|
|
"num_tokens": 3807299.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"entropy": 6.006121349334717,
|
|
"epoch": 1.700626959247649,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004890989801259935,
|
|
"loss": 5.9385,
|
|
"mean_token_accuracy": 0.14270951524376868,
|
|
"num_tokens": 3816292.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 6.049522542953492,
|
|
"epoch": 1.7045454545454546,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000489006298410496,
|
|
"loss": 5.8532,
|
|
"mean_token_accuracy": 0.1399511620402336,
|
|
"num_tokens": 3825070.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"entropy": 5.959496307373047,
|
|
"epoch": 1.70846394984326,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004889132342310438,
|
|
"loss": 5.8247,
|
|
"mean_token_accuracy": 0.14473102912306784,
|
|
"num_tokens": 3834016.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 5.956418132781982,
|
|
"epoch": 1.7123824451410659,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004888197877539577,
|
|
"loss": 5.8849,
|
|
"mean_token_accuracy": 0.14025031253695489,
|
|
"num_tokens": 3842172.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"entropy": 5.971714496612549,
|
|
"epoch": 1.7163009404388716,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004887259591462417,
|
|
"loss": 5.798,
|
|
"mean_token_accuracy": 0.14462782070040703,
|
|
"num_tokens": 3851109.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 5.923008251190185,
|
|
"epoch": 1.7202194357366771,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004886317485755825,
|
|
"loss": 5.924,
|
|
"mean_token_accuracy": 0.14367973506450654,
|
|
"num_tokens": 3859656.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"entropy": 5.933929538726806,
|
|
"epoch": 1.7241379310344827,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004885371562103498,
|
|
"loss": 5.7677,
|
|
"mean_token_accuracy": 0.14872414171695708,
|
|
"num_tokens": 3868698.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 5.8945310592651365,
|
|
"epoch": 1.7280564263322884,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004884421822195957,
|
|
"loss": 5.8545,
|
|
"mean_token_accuracy": 0.1411299616098404,
|
|
"num_tokens": 3877474.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"entropy": 5.9916675090789795,
|
|
"epoch": 1.7319749216300941,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004883468267730538,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.1443895533680916,
|
|
"num_tokens": 3886328.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 5.906041145324707,
|
|
"epoch": 1.7358934169278997,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00048825109004114006,
|
|
"loss": 5.8058,
|
|
"mean_token_accuracy": 0.1487313315272331,
|
|
"num_tokens": 3894424.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"entropy": 5.978779983520508,
|
|
"epoch": 1.7398119122257052,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004881549721949513,
|
|
"loss": 5.8897,
|
|
"mean_token_accuracy": 0.13909043669700621,
|
|
"num_tokens": 3903746.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 5.851698732376098,
|
|
"epoch": 1.743730407523511,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004880584734062658,
|
|
"loss": 5.8724,
|
|
"mean_token_accuracy": 0.14629912972450257,
|
|
"num_tokens": 3912033.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"entropy": 5.883364534378051,
|
|
"epoch": 1.7476489028213167,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004879615938475425,
|
|
"loss": 5.7739,
|
|
"mean_token_accuracy": 0.14114395081996917,
|
|
"num_tokens": 3920388.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 5.934035730361939,
|
|
"epoch": 1.7515673981191222,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004878643336919209,
|
|
"loss": 5.9772,
|
|
"mean_token_accuracy": 0.139875166118145,
|
|
"num_tokens": 3929926.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"entropy": 6.131916475296021,
|
|
"epoch": 1.7554858934169277,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004877666931132206,
|
|
"loss": 5.9601,
|
|
"mean_token_accuracy": 0.13168897181749345,
|
|
"num_tokens": 3938696.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 5.873245096206665,
|
|
"epoch": 1.7594043887147337,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004876686722859413,
|
|
"loss": 5.7899,
|
|
"mean_token_accuracy": 0.1513090804219246,
|
|
"num_tokens": 3947537.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"entropy": 5.992959451675415,
|
|
"epoch": 1.7633228840125392,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000487570271385262,
|
|
"loss": 5.7993,
|
|
"mean_token_accuracy": 0.14130587950348855,
|
|
"num_tokens": 3956539.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 5.880547904968262,
|
|
"epoch": 1.7672413793103448,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004874714905870411,
|
|
"loss": 5.7667,
|
|
"mean_token_accuracy": 0.14465454295277597,
|
|
"num_tokens": 3964927.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"entropy": 5.923993968963623,
|
|
"epoch": 1.7711598746081505,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004873723300678159,
|
|
"loss": 5.9062,
|
|
"mean_token_accuracy": 0.1402622014284134,
|
|
"num_tokens": 3973565.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 5.974027013778686,
|
|
"epoch": 1.7750783699059562,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048727279000480226,
|
|
"loss": 5.9283,
|
|
"mean_token_accuracy": 0.14056170955300332,
|
|
"num_tokens": 3982683.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"entropy": 6.065245008468628,
|
|
"epoch": 1.7789968652037618,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00048717287057589454,
|
|
"loss": 5.8374,
|
|
"mean_token_accuracy": 0.14206577837467194,
|
|
"num_tokens": 3991462.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 5.725625848770141,
|
|
"epoch": 1.7829153605015673,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004870725719596648,
|
|
"loss": 5.6081,
|
|
"mean_token_accuracy": 0.15651399940252303,
|
|
"num_tokens": 3999821.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"entropy": 5.943557548522949,
|
|
"epoch": 1.786833855799373,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004869718943353631,
|
|
"loss": 5.8541,
|
|
"mean_token_accuracy": 0.14041255488991738,
|
|
"num_tokens": 4008655.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 5.972759962081909,
|
|
"epoch": 1.7907523510971788,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00048687083788291656,
|
|
"loss": 5.8939,
|
|
"mean_token_accuracy": 0.14818500876426696,
|
|
"num_tokens": 4017317.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"entropy": 5.872368335723877,
|
|
"epoch": 1.7946708463949843,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00048676940278292953,
|
|
"loss": 5.7282,
|
|
"mean_token_accuracy": 0.14405218958854676,
|
|
"num_tokens": 4025407.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 5.908810663223266,
|
|
"epoch": 1.7985893416927898,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00048666758921668286,
|
|
"loss": 5.7369,
|
|
"mean_token_accuracy": 0.14384883195161818,
|
|
"num_tokens": 4034310.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"entropy": 5.879950332641601,
|
|
"epoch": 1.8025078369905956,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048656539736613403,
|
|
"loss": 5.8969,
|
|
"mean_token_accuracy": 0.1422630712389946,
|
|
"num_tokens": 4042326.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 5.854609394073487,
|
|
"epoch": 1.8064263322884013,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004864628274139164,
|
|
"loss": 5.7595,
|
|
"mean_token_accuracy": 0.1504887729883194,
|
|
"num_tokens": 4051248.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"entropy": 5.929189538955688,
|
|
"epoch": 1.8103448275862069,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004863598795433391,
|
|
"loss": 5.8763,
|
|
"mean_token_accuracy": 0.1436061829328537,
|
|
"num_tokens": 4060227.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 6.035856866836548,
|
|
"epoch": 1.8142633228840124,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00048625655393838666,
|
|
"loss": 5.9747,
|
|
"mean_token_accuracy": 0.1343390792608261,
|
|
"num_tokens": 4069700.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"entropy": 5.9489781856536865,
|
|
"epoch": 1.8181818181818183,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004861528507837186,
|
|
"loss": 5.8364,
|
|
"mean_token_accuracy": 0.14606306403875352,
|
|
"num_tokens": 4079363.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 6.038274621963501,
|
|
"epoch": 1.8221003134796239,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004860487702646695,
|
|
"loss": 5.9136,
|
|
"mean_token_accuracy": 0.14080933034420012,
|
|
"num_tokens": 4087944.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"entropy": 5.765271520614624,
|
|
"epoch": 1.8260188087774294,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004859443125672479,
|
|
"loss": 5.6074,
|
|
"mean_token_accuracy": 0.16549111306667327,
|
|
"num_tokens": 4096431.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 5.862937021255493,
|
|
"epoch": 1.8299373040752351,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004858394778781368,
|
|
"loss": 5.7839,
|
|
"mean_token_accuracy": 0.1452955462038517,
|
|
"num_tokens": 4104894.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"entropy": 5.929635095596313,
|
|
"epoch": 1.8338557993730409,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004857342663846927,
|
|
"loss": 5.8308,
|
|
"mean_token_accuracy": 0.1402455188333988,
|
|
"num_tokens": 4112878.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 5.906289291381836,
|
|
"epoch": 1.8377742946708464,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004856286782749456,
|
|
"loss": 5.839,
|
|
"mean_token_accuracy": 0.14992391616106032,
|
|
"num_tokens": 4122501.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"entropy": 5.893653440475464,
|
|
"epoch": 1.841692789968652,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004855227137375986,
|
|
"loss": 5.8018,
|
|
"mean_token_accuracy": 0.13767838329076768,
|
|
"num_tokens": 4131301.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 5.921551322937011,
|
|
"epoch": 1.8456112852664577,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004854163729620275,
|
|
"loss": 5.8349,
|
|
"mean_token_accuracy": 0.14852394610643388,
|
|
"num_tokens": 4140076.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"entropy": 5.864505481719971,
|
|
"epoch": 1.8495297805642634,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004853096561382805,
|
|
"loss": 5.7663,
|
|
"mean_token_accuracy": 0.15288633555173875,
|
|
"num_tokens": 4148728.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 5.914716482162476,
|
|
"epoch": 1.853448275862069,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004852025634570779,
|
|
"loss": 5.9037,
|
|
"mean_token_accuracy": 0.14291643872857093,
|
|
"num_tokens": 4157520.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"entropy": 5.905571317672729,
|
|
"epoch": 1.8573667711598745,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004850950951098116,
|
|
"loss": 5.7141,
|
|
"mean_token_accuracy": 0.15238296538591384,
|
|
"num_tokens": 4165547.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 5.928301191329956,
|
|
"epoch": 1.8612852664576802,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004849872512885451,
|
|
"loss": 5.8628,
|
|
"mean_token_accuracy": 0.1383912220597267,
|
|
"num_tokens": 4174656.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"entropy": 5.904009199142456,
|
|
"epoch": 1.865203761755486,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004848790321860127,
|
|
"loss": 5.9161,
|
|
"mean_token_accuracy": 0.1413537159562111,
|
|
"num_tokens": 4183452.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 5.909180545806885,
|
|
"epoch": 1.8691222570532915,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00048477043799561946,
|
|
"loss": 5.7966,
|
|
"mean_token_accuracy": 0.14063763692975045,
|
|
"num_tokens": 4192306.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"entropy": 5.8980663299560545,
|
|
"epoch": 1.873040752351097,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004846614689114409,
|
|
"loss": 5.8643,
|
|
"mean_token_accuracy": 0.14282563477754592,
|
|
"num_tokens": 4201201.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 5.890587997436524,
|
|
"epoch": 1.876959247648903,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004845521251282223,
|
|
"loss": 5.7437,
|
|
"mean_token_accuracy": 0.15185199975967406,
|
|
"num_tokens": 4209374.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"entropy": 5.809947061538696,
|
|
"epoch": 1.8808777429467085,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004844424068413789,
|
|
"loss": 5.7215,
|
|
"mean_token_accuracy": 0.1469734065234661,
|
|
"num_tokens": 4217258.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 5.848538589477539,
|
|
"epoch": 1.884796238244514,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00048433231424699504,
|
|
"loss": 5.7551,
|
|
"mean_token_accuracy": 0.1480185203254223,
|
|
"num_tokens": 4226484.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"entropy": 5.924857568740845,
|
|
"epoch": 1.8887147335423198,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00048422184754182384,
|
|
"loss": 5.7889,
|
|
"mean_token_accuracy": 0.14311063811182975,
|
|
"num_tokens": 4235546.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 5.950683832168579,
|
|
"epoch": 1.8926332288401255,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004841110069232875,
|
|
"loss": 5.9386,
|
|
"mean_token_accuracy": 0.1451379805803299,
|
|
"num_tokens": 4244549.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"entropy": 5.900457239151001,
|
|
"epoch": 1.896551724137931,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00048399979258947597,
|
|
"loss": 5.8331,
|
|
"mean_token_accuracy": 0.1464390404522419,
|
|
"num_tokens": 4252918.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 5.9453812599182125,
|
|
"epoch": 1.9004702194357366,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004838882047391474,
|
|
"loss": 5.8327,
|
|
"mean_token_accuracy": 0.14703057184815407,
|
|
"num_tokens": 4261674.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"entropy": 5.875091028213501,
|
|
"epoch": 1.9043887147335423,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048377624357172724,
|
|
"loss": 5.7425,
|
|
"mean_token_accuracy": 0.14618095010519028,
|
|
"num_tokens": 4269986.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 5.890668630599976,
|
|
"epoch": 1.908307210031348,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00048366390928730843,
|
|
"loss": 5.8377,
|
|
"mean_token_accuracy": 0.14733434692025185,
|
|
"num_tokens": 4278131.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"entropy": 5.966364622116089,
|
|
"epoch": 1.9122257053291536,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004835512020866504,
|
|
"loss": 5.8243,
|
|
"mean_token_accuracy": 0.14588867947459222,
|
|
"num_tokens": 4287214.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 5.893796443939209,
|
|
"epoch": 1.9161442006269591,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00048343812217117925,
|
|
"loss": 5.7504,
|
|
"mean_token_accuracy": 0.1461929127573967,
|
|
"num_tokens": 4295949.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"entropy": 5.786278104782104,
|
|
"epoch": 1.9200626959247649,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00048332466974298723,
|
|
"loss": 5.7543,
|
|
"mean_token_accuracy": 0.1465558797121048,
|
|
"num_tokens": 4305014.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 5.851295328140258,
|
|
"epoch": 1.9239811912225706,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048321084500483203,
|
|
"loss": 5.7265,
|
|
"mean_token_accuracy": 0.14898128807544708,
|
|
"num_tokens": 4313560.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"entropy": 5.87207818031311,
|
|
"epoch": 1.9278996865203761,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000483096648160137,
|
|
"loss": 5.8056,
|
|
"mean_token_accuracy": 0.1454322248697281,
|
|
"num_tokens": 4323280.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 5.872972869873047,
|
|
"epoch": 1.9318181818181817,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048298207941299047,
|
|
"loss": 5.861,
|
|
"mean_token_accuracy": 0.14596157446503638,
|
|
"num_tokens": 4331757.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"entropy": 5.8036915302276615,
|
|
"epoch": 1.9357366771159876,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00048286713896814536,
|
|
"loss": 5.7876,
|
|
"mean_token_accuracy": 0.14330902695655823,
|
|
"num_tokens": 4341453.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 6.028292417526245,
|
|
"epoch": 1.9396551724137931,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048275182703101877,
|
|
"loss": 5.7431,
|
|
"mean_token_accuracy": 0.14791915863752364,
|
|
"num_tokens": 4349095.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"entropy": 5.750636386871338,
|
|
"epoch": 1.9435736677115987,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00048263614380769193,
|
|
"loss": 5.8404,
|
|
"mean_token_accuracy": 0.14387739300727845,
|
|
"num_tokens": 4357913.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 5.927936792373657,
|
|
"epoch": 1.9474921630094044,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00048252008950490957,
|
|
"loss": 5.8083,
|
|
"mean_token_accuracy": 0.14284604787826538,
|
|
"num_tokens": 4366586.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"entropy": 5.8729980945587155,
|
|
"epoch": 1.9514106583072102,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048240366433007935,
|
|
"loss": 5.7307,
|
|
"mean_token_accuracy": 0.15130855292081832,
|
|
"num_tokens": 4375393.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 5.848481798171997,
|
|
"epoch": 1.9553291536050157,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00048228686849127213,
|
|
"loss": 5.8159,
|
|
"mean_token_accuracy": 0.15051234513521194,
|
|
"num_tokens": 4383747.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"entropy": 5.987670612335205,
|
|
"epoch": 1.9592476489028212,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004821697021972209,
|
|
"loss": 5.7804,
|
|
"mean_token_accuracy": 0.14112372547388077,
|
|
"num_tokens": 4392085.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 1.9592476489028212,
|
|
"eval_entropy": 5.819716327874235,
|
|
"eval_loss": 5.9731364250183105,
|
|
"eval_mean_token_accuracy": 0.14396101977791667,
|
|
"eval_num_tokens": 4392085.0,
|
|
"eval_runtime": 2.8283,
|
|
"eval_samples_per_second": 1457.403,
|
|
"eval_steps_per_second": 182.441,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 5.882195997238159,
|
|
"epoch": 1.963166144200627,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004820521656573208,
|
|
"loss": 5.8043,
|
|
"mean_token_accuracy": 0.14567812159657478,
|
|
"num_tokens": 4400843.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"entropy": 5.773218107223511,
|
|
"epoch": 1.9670846394984327,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004819342590816288,
|
|
"loss": 5.7471,
|
|
"mean_token_accuracy": 0.14761917144060135,
|
|
"num_tokens": 4409126.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 5.89848108291626,
|
|
"epoch": 1.9710031347962382,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004818159826808631,
|
|
"loss": 5.9097,
|
|
"mean_token_accuracy": 0.1417311027646065,
|
|
"num_tokens": 4418401.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"entropy": 5.897497129440308,
|
|
"epoch": 1.9749216300940438,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004816973366664026,
|
|
"loss": 5.811,
|
|
"mean_token_accuracy": 0.1456060990691185,
|
|
"num_tokens": 4428141.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 5.986608266830444,
|
|
"epoch": 1.9788401253918495,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004815783212502871,
|
|
"loss": 5.828,
|
|
"mean_token_accuracy": 0.1428292214870453,
|
|
"num_tokens": 4437158.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"entropy": 5.895078134536743,
|
|
"epoch": 1.9827586206896552,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048145893664521645,
|
|
"loss": 5.8973,
|
|
"mean_token_accuracy": 0.13774282485246658,
|
|
"num_tokens": 4445901.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 5.878189134597778,
|
|
"epoch": 1.9866771159874608,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048133918306455023,
|
|
"loss": 5.7054,
|
|
"mean_token_accuracy": 0.15397633910179137,
|
|
"num_tokens": 4454200.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"entropy": 5.8978513240814205,
|
|
"epoch": 1.9905956112852663,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004812190607223075,
|
|
"loss": 5.8244,
|
|
"mean_token_accuracy": 0.1423790991306305,
|
|
"num_tokens": 4463484.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 5.743094491958618,
|
|
"epoch": 1.9945141065830723,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00048109856983316655,
|
|
"loss": 5.6436,
|
|
"mean_token_accuracy": 0.1538312703371048,
|
|
"num_tokens": 4472659.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"entropy": 5.894764852523804,
|
|
"epoch": 1.9984326018808778,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000480977710612464,
|
|
"loss": 5.7289,
|
|
"mean_token_accuracy": 0.1462229423224926,
|
|
"num_tokens": 4481355.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 5.841469097137451,
|
|
"epoch": 2.0023510971786833,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004808564832761948,
|
|
"loss": 5.5879,
|
|
"mean_token_accuracy": 0.1482195809483528,
|
|
"num_tokens": 4490005.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"entropy": 5.781640338897705,
|
|
"epoch": 2.006269592476489,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004807348880410119,
|
|
"loss": 5.3827,
|
|
"mean_token_accuracy": 0.1616608127951622,
|
|
"num_tokens": 4498645.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 5.733411645889282,
|
|
"epoch": 2.010188087774295,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004806129251242258,
|
|
"loss": 5.5173,
|
|
"mean_token_accuracy": 0.15624455660581588,
|
|
"num_tokens": 4509675.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"entropy": 5.799925708770752,
|
|
"epoch": 2.0141065830721003,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00048049059474380393,
|
|
"loss": 5.3307,
|
|
"mean_token_accuracy": 0.1625298425555229,
|
|
"num_tokens": 4518463.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 5.857757616043091,
|
|
"epoch": 2.018025078369906,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00048036789711837047,
|
|
"loss": 5.4816,
|
|
"mean_token_accuracy": 0.15710628032684326,
|
|
"num_tokens": 4527120.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"entropy": 5.79098744392395,
|
|
"epoch": 2.0219435736677114,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048024483246720607,
|
|
"loss": 5.5141,
|
|
"mean_token_accuracy": 0.1521046057343483,
|
|
"num_tokens": 4535962.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 5.795133399963379,
|
|
"epoch": 2.0258620689655173,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004801214010102472,
|
|
"loss": 5.4748,
|
|
"mean_token_accuracy": 0.15003894567489623,
|
|
"num_tokens": 4545233.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"entropy": 5.720596790313721,
|
|
"epoch": 2.029780564263323,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004799976029680858,
|
|
"loss": 5.351,
|
|
"mean_token_accuracy": 0.1570914052426815,
|
|
"num_tokens": 4554658.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 5.793403244018554,
|
|
"epoch": 2.0336990595611284,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004798734385619691,
|
|
"loss": 5.5496,
|
|
"mean_token_accuracy": 0.1535790517926216,
|
|
"num_tokens": 4563964.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"entropy": 5.807430982589722,
|
|
"epoch": 2.0376175548589344,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000479748908013799,
|
|
"loss": 5.4134,
|
|
"mean_token_accuracy": 0.16276097446680068,
|
|
"num_tokens": 4572865.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 5.672877264022827,
|
|
"epoch": 2.04153605015674,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004796240115461319,
|
|
"loss": 5.3736,
|
|
"mean_token_accuracy": 0.17069609314203263,
|
|
"num_tokens": 4581021.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"entropy": 5.710117959976197,
|
|
"epoch": 2.0454545454545454,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004794987493821779,
|
|
"loss": 5.4687,
|
|
"mean_token_accuracy": 0.15729401111602784,
|
|
"num_tokens": 4590779.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 5.794814777374268,
|
|
"epoch": 2.049373040752351,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00047937312174580084,
|
|
"loss": 5.4478,
|
|
"mean_token_accuracy": 0.1574981316924095,
|
|
"num_tokens": 4599831.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"entropy": 5.857023143768311,
|
|
"epoch": 2.053291536050157,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004792471288615177,
|
|
"loss": 5.5945,
|
|
"mean_token_accuracy": 0.1500529244542122,
|
|
"num_tokens": 4609264.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 5.8015649795532225,
|
|
"epoch": 2.0572100313479624,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004791207709544981,
|
|
"loss": 5.5107,
|
|
"mean_token_accuracy": 0.15906523615121843,
|
|
"num_tokens": 4617885.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"entropy": 5.6719653606414795,
|
|
"epoch": 2.061128526645768,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00047899404825056424,
|
|
"loss": 5.4279,
|
|
"mean_token_accuracy": 0.15619982779026031,
|
|
"num_tokens": 4626536.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 5.770504665374756,
|
|
"epoch": 2.0650470219435735,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004788669609761901,
|
|
"loss": 5.5118,
|
|
"mean_token_accuracy": 0.1544651284813881,
|
|
"num_tokens": 4635563.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"entropy": 5.720648860931396,
|
|
"epoch": 2.0689655172413794,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00047873950935850107,
|
|
"loss": 5.4414,
|
|
"mean_token_accuracy": 0.1552010580897331,
|
|
"num_tokens": 4644822.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 5.75403094291687,
|
|
"epoch": 2.072884012539185,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004786116936252742,
|
|
"loss": 5.4081,
|
|
"mean_token_accuracy": 0.1630728781223297,
|
|
"num_tokens": 4652709.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"entropy": 5.651195621490478,
|
|
"epoch": 2.0768025078369905,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004784835140049367,
|
|
"loss": 5.447,
|
|
"mean_token_accuracy": 0.15986524671316146,
|
|
"num_tokens": 4662014.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 5.7658521175384525,
|
|
"epoch": 2.080721003134796,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004783549707265663,
|
|
"loss": 5.4033,
|
|
"mean_token_accuracy": 0.15730245560407638,
|
|
"num_tokens": 4670618.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"entropy": 5.7565501689910885,
|
|
"epoch": 2.084639498432602,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00047822606401989084,
|
|
"loss": 5.5384,
|
|
"mean_token_accuracy": 0.14618516638875007,
|
|
"num_tokens": 4679937.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 5.747328805923462,
|
|
"epoch": 2.0885579937304075,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004780967941152873,
|
|
"loss": 5.3716,
|
|
"mean_token_accuracy": 0.16848112791776657,
|
|
"num_tokens": 4688006.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"entropy": 5.598170948028565,
|
|
"epoch": 2.092476489028213,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004779671612437822,
|
|
"loss": 5.3212,
|
|
"mean_token_accuracy": 0.16888306885957718,
|
|
"num_tokens": 4697106.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 5.709634304046631,
|
|
"epoch": 2.096394984326019,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00047783716563705035,
|
|
"loss": 5.4699,
|
|
"mean_token_accuracy": 0.15956653356552125,
|
|
"num_tokens": 4705582.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"entropy": 5.789704513549805,
|
|
"epoch": 2.1003134796238245,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000477706807527415,
|
|
"loss": 5.4032,
|
|
"mean_token_accuracy": 0.1633769229054451,
|
|
"num_tokens": 4713640.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 5.7358156681060795,
|
|
"epoch": 2.10423197492163,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004775760871478472,
|
|
"loss": 5.5193,
|
|
"mean_token_accuracy": 0.1518564686179161,
|
|
"num_tokens": 4722729.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"entropy": 5.710189342498779,
|
|
"epoch": 2.1081504702194356,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047744500473196564,
|
|
"loss": 5.4226,
|
|
"mean_token_accuracy": 0.16306858509778976,
|
|
"num_tokens": 4732099.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 5.778943872451782,
|
|
"epoch": 2.1120689655172415,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047731356051403556,
|
|
"loss": 5.4259,
|
|
"mean_token_accuracy": 0.15491524189710618,
|
|
"num_tokens": 4741766.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"entropy": 5.550606107711792,
|
|
"epoch": 2.115987460815047,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004771817547289693,
|
|
"loss": 5.3501,
|
|
"mean_token_accuracy": 0.16072812229394912,
|
|
"num_tokens": 4750890.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 5.691572904586792,
|
|
"epoch": 2.1199059561128526,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004770495876123251,
|
|
"loss": 5.4713,
|
|
"mean_token_accuracy": 0.15760626047849655,
|
|
"num_tokens": 4760063.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"entropy": 5.7919927597045895,
|
|
"epoch": 2.123824451410658,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004769170594003071,
|
|
"loss": 5.4737,
|
|
"mean_token_accuracy": 0.16406295895576478,
|
|
"num_tokens": 4768928.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 5.65070915222168,
|
|
"epoch": 2.127742946708464,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00047678417032976457,
|
|
"loss": 5.4532,
|
|
"mean_token_accuracy": 0.15579911768436433,
|
|
"num_tokens": 4778129.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"entropy": 5.69934196472168,
|
|
"epoch": 2.1316614420062696,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004766509206381919,
|
|
"loss": 5.4479,
|
|
"mean_token_accuracy": 0.1599138170480728,
|
|
"num_tokens": 4786759.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 5.6193219184875485,
|
|
"epoch": 2.135579937304075,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004765173105637279,
|
|
"loss": 5.4547,
|
|
"mean_token_accuracy": 0.16395413279533386,
|
|
"num_tokens": 4795355.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"entropy": 5.696455669403076,
|
|
"epoch": 2.139498432601881,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00047638334034515547,
|
|
"loss": 5.3964,
|
|
"mean_token_accuracy": 0.15683644711971284,
|
|
"num_tokens": 4804030.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 5.580397653579712,
|
|
"epoch": 2.1434169278996866,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00047624901022190106,
|
|
"loss": 5.3523,
|
|
"mean_token_accuracy": 0.17931961715221406,
|
|
"num_tokens": 4812779.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"entropy": 5.7006062984466555,
|
|
"epoch": 2.147335423197492,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047611432043403437,
|
|
"loss": 5.4629,
|
|
"mean_token_accuracy": 0.16019529104232788,
|
|
"num_tokens": 4822292.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 5.834177780151367,
|
|
"epoch": 2.1512539184952977,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004759792712222679,
|
|
"loss": 5.535,
|
|
"mean_token_accuracy": 0.15739217698574065,
|
|
"num_tokens": 4830524.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"entropy": 5.8059389114379885,
|
|
"epoch": 2.1551724137931036,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004758438628279565,
|
|
"loss": 5.5164,
|
|
"mean_token_accuracy": 0.15928612202405928,
|
|
"num_tokens": 4839328.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 5.638897466659546,
|
|
"epoch": 2.159090909090909,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00047570809549309697,
|
|
"loss": 5.3591,
|
|
"mean_token_accuracy": 0.1710207626223564,
|
|
"num_tokens": 4847711.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"entropy": 5.774710035324096,
|
|
"epoch": 2.1630094043887147,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004755719694603275,
|
|
"loss": 5.549,
|
|
"mean_token_accuracy": 0.15182463973760604,
|
|
"num_tokens": 4856706.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 5.728883123397827,
|
|
"epoch": 2.16692789968652,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004754354849729274,
|
|
"loss": 5.5423,
|
|
"mean_token_accuracy": 0.15759230852127076,
|
|
"num_tokens": 4865250.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"entropy": 5.702935409545899,
|
|
"epoch": 2.170846394984326,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047529864227481653,
|
|
"loss": 5.4631,
|
|
"mean_token_accuracy": 0.15977989733219147,
|
|
"num_tokens": 4873826.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 5.781952810287476,
|
|
"epoch": 2.1747648902821317,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000475161441610555,
|
|
"loss": 5.44,
|
|
"mean_token_accuracy": 0.16142217814922333,
|
|
"num_tokens": 4882631.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"entropy": 5.665818929672241,
|
|
"epoch": 2.1786833855799372,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004750238832253427,
|
|
"loss": 5.4775,
|
|
"mean_token_accuracy": 0.15787807181477548,
|
|
"num_tokens": 4891007.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 5.697315788269043,
|
|
"epoch": 2.1826018808777428,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004748859673650187,
|
|
"loss": 5.386,
|
|
"mean_token_accuracy": 0.16155248284339904,
|
|
"num_tokens": 4899692.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"entropy": 5.658601570129394,
|
|
"epoch": 2.1865203761755487,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00047474769427606115,
|
|
"loss": 5.5095,
|
|
"mean_token_accuracy": 0.15856993645429612,
|
|
"num_tokens": 4909354.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 5.6818701267242435,
|
|
"epoch": 2.1904388714733543,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004746090642055863,
|
|
"loss": 5.4671,
|
|
"mean_token_accuracy": 0.1562755212187767,
|
|
"num_tokens": 4918400.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"entropy": 5.739003992080688,
|
|
"epoch": 2.19435736677116,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047447007740134857,
|
|
"loss": 5.5467,
|
|
"mean_token_accuracy": 0.14892319440841675,
|
|
"num_tokens": 4926852.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 5.7248610496521,
|
|
"epoch": 2.1982758620689653,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00047433073411174,
|
|
"loss": 5.4509,
|
|
"mean_token_accuracy": 0.15806010216474534,
|
|
"num_tokens": 4935717.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"entropy": 5.591716861724853,
|
|
"epoch": 2.2021943573667713,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004741910345857896,
|
|
"loss": 5.3813,
|
|
"mean_token_accuracy": 0.16058638840913772,
|
|
"num_tokens": 4945389.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 5.752324533462525,
|
|
"epoch": 2.206112852664577,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047405097907316315,
|
|
"loss": 5.5201,
|
|
"mean_token_accuracy": 0.1630863979458809,
|
|
"num_tokens": 4953670.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"entropy": 5.657805633544922,
|
|
"epoch": 2.2100313479623823,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004739105678241625,
|
|
"loss": 5.5289,
|
|
"mean_token_accuracy": 0.15918288826942445,
|
|
"num_tokens": 4962383.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 5.662711334228516,
|
|
"epoch": 2.2139498432601883,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004737698010897253,
|
|
"loss": 5.3295,
|
|
"mean_token_accuracy": 0.16085091829299927,
|
|
"num_tokens": 4970672.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"entropy": 5.757535028457641,
|
|
"epoch": 2.217868338557994,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004736286791214245,
|
|
"loss": 5.5162,
|
|
"mean_token_accuracy": 0.15798387676477432,
|
|
"num_tokens": 4979303.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 5.72713942527771,
|
|
"epoch": 2.2217868338557993,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047348720217146807,
|
|
"loss": 5.3865,
|
|
"mean_token_accuracy": 0.1653503268957138,
|
|
"num_tokens": 4988322.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"entropy": 5.582199764251709,
|
|
"epoch": 2.225705329153605,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047334537049269806,
|
|
"loss": 5.4754,
|
|
"mean_token_accuracy": 0.15811690539121628,
|
|
"num_tokens": 4997250.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 5.754626846313476,
|
|
"epoch": 2.229623824451411,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004732031843385909,
|
|
"loss": 5.5191,
|
|
"mean_token_accuracy": 0.151338791847229,
|
|
"num_tokens": 5005459.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"entropy": 5.570264291763306,
|
|
"epoch": 2.2335423197492164,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004730606439632562,
|
|
"loss": 5.2819,
|
|
"mean_token_accuracy": 0.17884395718574525,
|
|
"num_tokens": 5013704.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 5.602528190612793,
|
|
"epoch": 2.237460815047022,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004729177496214367,
|
|
"loss": 5.3976,
|
|
"mean_token_accuracy": 0.16089559048414231,
|
|
"num_tokens": 5022545.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"entropy": 5.619579839706421,
|
|
"epoch": 2.2413793103448274,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00047277450156850767,
|
|
"loss": 5.4026,
|
|
"mean_token_accuracy": 0.16599346250295638,
|
|
"num_tokens": 5031133.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 5.6934120655059814,
|
|
"epoch": 2.2452978056426334,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004726309000604768,
|
|
"loss": 5.4422,
|
|
"mean_token_accuracy": 0.15853513032197952,
|
|
"num_tokens": 5039424.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"entropy": 5.7841479778289795,
|
|
"epoch": 2.249216300940439,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004724869453539832,
|
|
"loss": 5.6437,
|
|
"mean_token_accuracy": 0.15349650308489798,
|
|
"num_tokens": 5048868.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 5.743923711776733,
|
|
"epoch": 2.2531347962382444,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004723426377062972,
|
|
"loss": 5.4742,
|
|
"mean_token_accuracy": 0.15738968551158905,
|
|
"num_tokens": 5057532.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"entropy": 5.752024412155151,
|
|
"epoch": 2.2570532915360504,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00047219797737532,
|
|
"loss": 5.4875,
|
|
"mean_token_accuracy": 0.15550234615802766,
|
|
"num_tokens": 5066088.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 5.5935587882995605,
|
|
"epoch": 2.260971786833856,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047205296461958314,
|
|
"loss": 5.3623,
|
|
"mean_token_accuracy": 0.1605392187833786,
|
|
"num_tokens": 5074968.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"entropy": 5.607887029647827,
|
|
"epoch": 2.2648902821316614,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00047190759969824785,
|
|
"loss": 5.4266,
|
|
"mean_token_accuracy": 0.1619516670703888,
|
|
"num_tokens": 5083605.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 5.665104103088379,
|
|
"epoch": 2.268808777429467,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00047176188287110485,
|
|
"loss": 5.4267,
|
|
"mean_token_accuracy": 0.16131291091442107,
|
|
"num_tokens": 5092182.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"entropy": 5.784565830230713,
|
|
"epoch": 2.2727272727272725,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004716158143985737,
|
|
"loss": 5.5997,
|
|
"mean_token_accuracy": 0.15428409725427628,
|
|
"num_tokens": 5100910.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 5.705980825424194,
|
|
"epoch": 2.2766457680250785,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047146939454170245,
|
|
"loss": 5.4627,
|
|
"mean_token_accuracy": 0.16400493532419205,
|
|
"num_tokens": 5109760.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"entropy": 5.720902061462402,
|
|
"epoch": 2.280564263322884,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004713226235621672,
|
|
"loss": 5.4957,
|
|
"mean_token_accuracy": 0.15792311280965804,
|
|
"num_tokens": 5119193.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 5.61166467666626,
|
|
"epoch": 2.2844827586206895,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004711755017222714,
|
|
"loss": 5.4122,
|
|
"mean_token_accuracy": 0.1576576665043831,
|
|
"num_tokens": 5127480.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"entropy": 5.743918991088867,
|
|
"epoch": 2.2884012539184955,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047102802928494563,
|
|
"loss": 5.5857,
|
|
"mean_token_accuracy": 0.15232446938753127,
|
|
"num_tokens": 5136648.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 5.688281393051147,
|
|
"epoch": 2.292319749216301,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004708802065137471,
|
|
"loss": 5.3771,
|
|
"mean_token_accuracy": 0.16786309629678725,
|
|
"num_tokens": 5144708.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"entropy": 5.643393802642822,
|
|
"epoch": 2.2962382445141065,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004707320336728591,
|
|
"loss": 5.5237,
|
|
"mean_token_accuracy": 0.15052489414811135,
|
|
"num_tokens": 5153388.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 5.797506046295166,
|
|
"epoch": 2.300156739811912,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004705835110270904,
|
|
"loss": 5.4824,
|
|
"mean_token_accuracy": 0.15724072754383087,
|
|
"num_tokens": 5161874.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"entropy": 5.700078773498535,
|
|
"epoch": 2.304075235109718,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00047043463884187517,
|
|
"loss": 5.4791,
|
|
"mean_token_accuracy": 0.16156153082847596,
|
|
"num_tokens": 5170284.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 5.583881044387818,
|
|
"epoch": 2.3079937304075235,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00047028541738327207,
|
|
"loss": 5.44,
|
|
"mean_token_accuracy": 0.15695979446172714,
|
|
"num_tokens": 5179547.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"entropy": 5.691956996917725,
|
|
"epoch": 2.311912225705329,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004701358469179641,
|
|
"loss": 5.4472,
|
|
"mean_token_accuracy": 0.16403508335351943,
|
|
"num_tokens": 5188256.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 5.756633996963501,
|
|
"epoch": 2.3158307210031346,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004699859277132578,
|
|
"loss": 5.5902,
|
|
"mean_token_accuracy": 0.16007334217429162,
|
|
"num_tokens": 5197464.0,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"entropy": 5.734570837020874,
|
|
"epoch": 2.3197492163009406,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046983566003708336,
|
|
"loss": 5.3792,
|
|
"mean_token_accuracy": 0.16429649144411088,
|
|
"num_tokens": 5205989.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 5.587563323974609,
|
|
"epoch": 2.323667711598746,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046968504415799325,
|
|
"loss": 5.4346,
|
|
"mean_token_accuracy": 0.1603007957339287,
|
|
"num_tokens": 5214490.0,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"entropy": 5.64924201965332,
|
|
"epoch": 2.3275862068965516,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004695340803451625,
|
|
"loss": 5.464,
|
|
"mean_token_accuracy": 0.16044287830591203,
|
|
"num_tokens": 5223335.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 5.629110479354859,
|
|
"epoch": 2.3315047021943576,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004693827688683879,
|
|
"loss": 5.4147,
|
|
"mean_token_accuracy": 0.15993678867816924,
|
|
"num_tokens": 5231665.0,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"entropy": 5.594243144989013,
|
|
"epoch": 2.335423197492163,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004692311099980878,
|
|
"loss": 5.4005,
|
|
"mean_token_accuracy": 0.16494683176279068,
|
|
"num_tokens": 5239810.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 5.5794525146484375,
|
|
"epoch": 2.3393416927899686,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046907910400530097,
|
|
"loss": 5.369,
|
|
"mean_token_accuracy": 0.16893674433231354,
|
|
"num_tokens": 5247525.0,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"entropy": 5.594454145431518,
|
|
"epoch": 2.343260188087774,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004689267511616868,
|
|
"loss": 5.4189,
|
|
"mean_token_accuracy": 0.15772538781166076,
|
|
"num_tokens": 5255528.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 5.617967176437378,
|
|
"epoch": 2.34717868338558,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046877405173952465,
|
|
"loss": 5.3633,
|
|
"mean_token_accuracy": 0.17105703949928283,
|
|
"num_tokens": 5263944.0,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"entropy": 5.657676124572754,
|
|
"epoch": 2.3510971786833856,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000468621006011713,
|
|
"loss": 5.4912,
|
|
"mean_token_accuracy": 0.15778652876615523,
|
|
"num_tokens": 5271724.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 2.3510971786833856,
|
|
"eval_entropy": 5.565460778021997,
|
|
"eval_loss": 5.932183742523193,
|
|
"eval_mean_token_accuracy": 0.14874126017815614,
|
|
"eval_num_tokens": 5271724.0,
|
|
"eval_runtime": 2.835,
|
|
"eval_samples_per_second": 1453.963,
|
|
"eval_steps_per_second": 182.01,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 5.714021825790406,
|
|
"epoch": 2.355015673981191,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046846761425176943,
|
|
"loss": 5.5009,
|
|
"mean_token_accuracy": 0.16061849147081375,
|
|
"num_tokens": 5281199.0,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"entropy": 5.739114904403687,
|
|
"epoch": 2.3589341692789967,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004683138767338299,
|
|
"loss": 5.5238,
|
|
"mean_token_accuracy": 0.15723237693309783,
|
|
"num_tokens": 5289782.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 5.584011459350586,
|
|
"epoch": 2.3628526645768027,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004681597937326483,
|
|
"loss": 5.3349,
|
|
"mean_token_accuracy": 0.16592346727848054,
|
|
"num_tokens": 5297922.0,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"entropy": 5.6475341796875,
|
|
"epoch": 2.366771159874608,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004680053655235959,
|
|
"loss": 5.4798,
|
|
"mean_token_accuracy": 0.16429754048585893,
|
|
"num_tokens": 5306178.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 5.601258897781372,
|
|
"epoch": 2.3706896551724137,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004678505923826609,
|
|
"loss": 5.451,
|
|
"mean_token_accuracy": 0.16447941958904266,
|
|
"num_tokens": 5314936.0,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"entropy": 5.6725733280181885,
|
|
"epoch": 2.3746081504702197,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046769547458644817,
|
|
"loss": 5.4771,
|
|
"mean_token_accuracy": 0.1562672033905983,
|
|
"num_tokens": 5323549.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 5.661346340179444,
|
|
"epoch": 2.378526645768025,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004675400124121782,
|
|
"loss": 5.4679,
|
|
"mean_token_accuracy": 0.15456231087446212,
|
|
"num_tokens": 5332310.0,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"entropy": 5.744946050643921,
|
|
"epoch": 2.3824451410658307,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00046738420613768716,
|
|
"loss": 5.5901,
|
|
"mean_token_accuracy": 0.15420550107955933,
|
|
"num_tokens": 5340757.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 5.799703979492188,
|
|
"epoch": 2.3863636363636362,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046722805604142614,
|
|
"loss": 5.573,
|
|
"mean_token_accuracy": 0.1541631817817688,
|
|
"num_tokens": 5349433.0,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"entropy": 5.661766195297242,
|
|
"epoch": 2.3902821316614418,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00046707156240246076,
|
|
"loss": 5.5251,
|
|
"mean_token_accuracy": 0.15046581178903579,
|
|
"num_tokens": 5358943.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 5.66755428314209,
|
|
"epoch": 2.3942006269592477,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046691472550047027,
|
|
"loss": 5.4398,
|
|
"mean_token_accuracy": 0.16168949156999587,
|
|
"num_tokens": 5367782.0,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"entropy": 5.573693132400512,
|
|
"epoch": 2.3981191222570533,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046675754561574783,
|
|
"loss": 5.3238,
|
|
"mean_token_accuracy": 0.17014608532190323,
|
|
"num_tokens": 5376186.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 5.694865655899048,
|
|
"epoch": 2.402037617554859,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00046660002302919933,
|
|
"loss": 5.5838,
|
|
"mean_token_accuracy": 0.15487318187952043,
|
|
"num_tokens": 5384888.0,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"entropy": 5.747969770431519,
|
|
"epoch": 2.4059561128526648,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004664421580223433,
|
|
"loss": 5.4618,
|
|
"mean_token_accuracy": 0.1624412640929222,
|
|
"num_tokens": 5393394.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 5.710650634765625,
|
|
"epoch": 2.4098746081504703,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046628395087730995,
|
|
"loss": 5.5272,
|
|
"mean_token_accuracy": 0.15911675691604615,
|
|
"num_tokens": 5402901.0,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"entropy": 5.690752744674683,
|
|
"epoch": 2.413793103448276,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004661254018768411,
|
|
"loss": 5.5223,
|
|
"mean_token_accuracy": 0.15797928348183632,
|
|
"num_tokens": 5412704.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 5.631660890579224,
|
|
"epoch": 2.4177115987460813,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004659665113042897,
|
|
"loss": 5.4885,
|
|
"mean_token_accuracy": 0.16516524255275727,
|
|
"num_tokens": 5420863.0,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"entropy": 5.601195001602173,
|
|
"epoch": 2.4216300940438873,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004658072794436187,
|
|
"loss": 5.4378,
|
|
"mean_token_accuracy": 0.1611546367406845,
|
|
"num_tokens": 5430002.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 5.615525960922241,
|
|
"epoch": 2.425548589341693,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046564770657940146,
|
|
"loss": 5.4165,
|
|
"mean_token_accuracy": 0.16358508318662643,
|
|
"num_tokens": 5438424.0,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"entropy": 5.588921403884887,
|
|
"epoch": 2.4294670846394983,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004654877929968205,
|
|
"loss": 5.4199,
|
|
"mean_token_accuracy": 0.16612940281629562,
|
|
"num_tokens": 5446970.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 5.691753959655761,
|
|
"epoch": 2.433385579937304,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004653275389816673,
|
|
"loss": 5.5083,
|
|
"mean_token_accuracy": 0.15316883698105813,
|
|
"num_tokens": 5456689.0,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"entropy": 5.600510358810425,
|
|
"epoch": 2.43730407523511,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00046516694482034174,
|
|
"loss": 5.3974,
|
|
"mean_token_accuracy": 0.16697040051221848,
|
|
"num_tokens": 5465269.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 5.6311359882354735,
|
|
"epoch": 2.4412225705329154,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046500601079985164,
|
|
"loss": 5.4653,
|
|
"mean_token_accuracy": 0.1607479929924011,
|
|
"num_tokens": 5473930.0,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"entropy": 5.596141195297241,
|
|
"epoch": 2.445141065830721,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004648447372078123,
|
|
"loss": 5.4947,
|
|
"mean_token_accuracy": 0.1638297162950039,
|
|
"num_tokens": 5482885.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 5.646732997894287,
|
|
"epoch": 2.449059561128527,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004646831243324457,
|
|
"loss": 5.4329,
|
|
"mean_token_accuracy": 0.16114894300699234,
|
|
"num_tokens": 5491489.0,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"entropy": 5.697061824798584,
|
|
"epoch": 2.4529780564263324,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004645211724625802,
|
|
"loss": 5.4942,
|
|
"mean_token_accuracy": 0.1590850308537483,
|
|
"num_tokens": 5499694.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 5.564854335784912,
|
|
"epoch": 2.456896551724138,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00046435888188765015,
|
|
"loss": 5.3795,
|
|
"mean_token_accuracy": 0.16652424037456512,
|
|
"num_tokens": 5508037.0,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"entropy": 5.583059644699096,
|
|
"epoch": 2.4608150470219434,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004641962528976951,
|
|
"loss": 5.3679,
|
|
"mean_token_accuracy": 0.16881446093320845,
|
|
"num_tokens": 5516644.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 5.611056900024414,
|
|
"epoch": 2.4647335423197494,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004640332857833593,
|
|
"loss": 5.4417,
|
|
"mean_token_accuracy": 0.15885722637176514,
|
|
"num_tokens": 5524789.0,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"entropy": 5.682217168807983,
|
|
"epoch": 2.468652037617555,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046386998083589156,
|
|
"loss": 5.4746,
|
|
"mean_token_accuracy": 0.16199354231357574,
|
|
"num_tokens": 5533739.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 5.724471616744995,
|
|
"epoch": 2.4725705329153604,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004637063383471442,
|
|
"loss": 5.4943,
|
|
"mean_token_accuracy": 0.16090431064367294,
|
|
"num_tokens": 5542735.0,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"entropy": 5.5542542934417725,
|
|
"epoch": 2.476489028213166,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046354235860957287,
|
|
"loss": 5.3446,
|
|
"mean_token_accuracy": 0.17048244625329972,
|
|
"num_tokens": 5551272.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 5.644335508346558,
|
|
"epoch": 2.480407523510972,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004633780419162361,
|
|
"loss": 5.5349,
|
|
"mean_token_accuracy": 0.1576820582151413,
|
|
"num_tokens": 5559868.0,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"entropy": 5.637001800537109,
|
|
"epoch": 2.4843260188087775,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046321338856079435,
|
|
"loss": 5.3278,
|
|
"mean_token_accuracy": 0.17181121557950974,
|
|
"num_tokens": 5568276.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 5.593462800979614,
|
|
"epoch": 2.488244514106583,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046304839883750987,
|
|
"loss": 5.4098,
|
|
"mean_token_accuracy": 0.1587411031126976,
|
|
"num_tokens": 5576462.0,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"entropy": 5.67059473991394,
|
|
"epoch": 2.492163009404389,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000462883073041246,
|
|
"loss": 5.5546,
|
|
"mean_token_accuracy": 0.15721471160650252,
|
|
"num_tokens": 5586044.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 5.487815952301025,
|
|
"epoch": 2.4960815047021945,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004627174114674669,
|
|
"loss": 5.3507,
|
|
"mean_token_accuracy": 0.16307419240474702,
|
|
"num_tokens": 5594626.0,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"entropy": 5.688052940368652,
|
|
"epoch": 2.5,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004625514144122365,
|
|
"loss": 5.4132,
|
|
"mean_token_accuracy": 0.15807681083679198,
|
|
"num_tokens": 5603268.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 5.582400751113892,
|
|
"epoch": 2.5039184952978055,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004623850821722185,
|
|
"loss": 5.3728,
|
|
"mean_token_accuracy": 0.16239736527204512,
|
|
"num_tokens": 5612053.0,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"entropy": 5.606302261352539,
|
|
"epoch": 2.507836990595611,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004622184150446756,
|
|
"loss": 5.6454,
|
|
"mean_token_accuracy": 0.14613621309399605,
|
|
"num_tokens": 5621297.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 5.737848901748658,
|
|
"epoch": 2.511755485893417,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046205141332746904,
|
|
"loss": 5.4563,
|
|
"mean_token_accuracy": 0.16175559759140015,
|
|
"num_tokens": 5629683.0,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"entropy": 5.516789770126342,
|
|
"epoch": 2.5156739811912225,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00046188407731905787,
|
|
"loss": 5.3899,
|
|
"mean_token_accuracy": 0.16603742241859437,
|
|
"num_tokens": 5638707.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 5.679071044921875,
|
|
"epoch": 2.519592476489028,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004617164073184987,
|
|
"loss": 5.4352,
|
|
"mean_token_accuracy": 0.16149932891130447,
|
|
"num_tokens": 5648241.0,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"entropy": 5.612945175170898,
|
|
"epoch": 2.523510971786834,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046154840362544496,
|
|
"loss": 5.4819,
|
|
"mean_token_accuracy": 0.15746894627809524,
|
|
"num_tokens": 5656743.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 5.615722131729126,
|
|
"epoch": 2.5274294670846396,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004613800665401466,
|
|
"loss": 5.4624,
|
|
"mean_token_accuracy": 0.1578985258936882,
|
|
"num_tokens": 5666418.0,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"entropy": 5.603916788101197,
|
|
"epoch": 2.531347962382445,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004612113963634493,
|
|
"loss": 5.4203,
|
|
"mean_token_accuracy": 0.15906696319580077,
|
|
"num_tokens": 5675404.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 5.617366218566895,
|
|
"epoch": 2.535266457680251,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004610423933967938,
|
|
"loss": 5.4933,
|
|
"mean_token_accuracy": 0.15284438133239747,
|
|
"num_tokens": 5684699.0,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"entropy": 5.690285110473633,
|
|
"epoch": 2.5391849529780566,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000460873057942216,
|
|
"loss": 5.4887,
|
|
"mean_token_accuracy": 0.16056734919548035,
|
|
"num_tokens": 5693219.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 5.702360773086548,
|
|
"epoch": 2.543103448275862,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004607033903023458,
|
|
"loss": 5.522,
|
|
"mean_token_accuracy": 0.15981431901454926,
|
|
"num_tokens": 5702005.0,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"entropy": 5.733270788192749,
|
|
"epoch": 2.5470219435736676,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046053339078040674,
|
|
"loss": 5.5047,
|
|
"mean_token_accuracy": 0.15839407444000245,
|
|
"num_tokens": 5710715.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 5.598942708969116,
|
|
"epoch": 2.550940438871473,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004603630596802155,
|
|
"loss": 5.4215,
|
|
"mean_token_accuracy": 0.16150805950164795,
|
|
"num_tokens": 5719444.0,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"entropy": 5.517152309417725,
|
|
"epoch": 2.554858934169279,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004601923973061814,
|
|
"loss": 5.4324,
|
|
"mean_token_accuracy": 0.15545494109392166,
|
|
"num_tokens": 5727755.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 5.615048742294311,
|
|
"epoch": 2.5587774294670846,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00046002140396330575,
|
|
"loss": 5.335,
|
|
"mean_token_accuracy": 0.1671610489487648,
|
|
"num_tokens": 5736933.0,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"entropy": 5.673175382614136,
|
|
"epoch": 2.56269592476489,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045985007995718154,
|
|
"loss": 5.517,
|
|
"mean_token_accuracy": 0.1642877921462059,
|
|
"num_tokens": 5745831.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 5.678241300582886,
|
|
"epoch": 2.566614420062696,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004596784255939923,
|
|
"loss": 5.5531,
|
|
"mean_token_accuracy": 0.15819532945752143,
|
|
"num_tokens": 5755419.0,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"entropy": 5.631372880935669,
|
|
"epoch": 2.5705329153605017,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004595064411805123,
|
|
"loss": 5.4329,
|
|
"mean_token_accuracy": 0.16395663022994994,
|
|
"num_tokens": 5764008.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 5.574886322021484,
|
|
"epoch": 2.574451410658307,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004593341270241057,
|
|
"loss": 5.3526,
|
|
"mean_token_accuracy": 0.1645262286067009,
|
|
"num_tokens": 5772129.0,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"entropy": 5.605494165420533,
|
|
"epoch": 2.5783699059561127,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004591614834327257,
|
|
"loss": 5.4728,
|
|
"mean_token_accuracy": 0.1581420123577118,
|
|
"num_tokens": 5781079.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 5.610899257659912,
|
|
"epoch": 2.5822884012539182,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045898851071491444,
|
|
"loss": 5.4462,
|
|
"mean_token_accuracy": 0.16424137055873872,
|
|
"num_tokens": 5790444.0,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"entropy": 5.533226203918457,
|
|
"epoch": 2.586206896551724,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004588152091798022,
|
|
"loss": 5.3204,
|
|
"mean_token_accuracy": 0.16239900141954422,
|
|
"num_tokens": 5799828.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 5.688413190841675,
|
|
"epoch": 2.5901253918495297,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004586415791371069,
|
|
"loss": 5.6048,
|
|
"mean_token_accuracy": 0.14757455736398697,
|
|
"num_tokens": 5808765.0,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"entropy": 5.726306390762329,
|
|
"epoch": 2.5940438871473352,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004584676208971336,
|
|
"loss": 5.3726,
|
|
"mean_token_accuracy": 0.16757129430770873,
|
|
"num_tokens": 5817281.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 5.520856428146362,
|
|
"epoch": 2.597962382445141,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045829333477077384,
|
|
"loss": 5.4159,
|
|
"mean_token_accuracy": 0.17432797700166702,
|
|
"num_tokens": 5825741.0,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"entropy": 5.61427526473999,
|
|
"epoch": 2.6018808777429467,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004581187210695053,
|
|
"loss": 5.3821,
|
|
"mean_token_accuracy": 0.1676635965704918,
|
|
"num_tokens": 5834466.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 5.560508775711059,
|
|
"epoch": 2.6057993730407523,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000457943780105391,
|
|
"loss": 5.4323,
|
|
"mean_token_accuracy": 0.16290040761232377,
|
|
"num_tokens": 5843463.0,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"entropy": 5.613198900222779,
|
|
"epoch": 2.6097178683385582,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00045776851219107856,
|
|
"loss": 5.4575,
|
|
"mean_token_accuracy": 0.15541307330131532,
|
|
"num_tokens": 5851885.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 5.691914033889771,
|
|
"epoch": 2.6136363636363638,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00045759291763980035,
|
|
"loss": 5.4538,
|
|
"mean_token_accuracy": 0.16642256081104279,
|
|
"num_tokens": 5861537.0,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"entropy": 5.5215497493743895,
|
|
"epoch": 2.6175548589341693,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00045741699676537227,
|
|
"loss": 5.3842,
|
|
"mean_token_accuracy": 0.16373006626963615,
|
|
"num_tokens": 5870332.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 5.634155559539795,
|
|
"epoch": 2.621473354231975,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045724074988219343,
|
|
"loss": 5.5963,
|
|
"mean_token_accuracy": 0.15622956901788712,
|
|
"num_tokens": 5879240.0,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"entropy": 5.525441694259643,
|
|
"epoch": 2.6253918495297803,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045706417730524565,
|
|
"loss": 5.3479,
|
|
"mean_token_accuracy": 0.1612783044576645,
|
|
"num_tokens": 5887961.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 5.667619323730468,
|
|
"epoch": 2.6293103448275863,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004568872793500927,
|
|
"loss": 5.5637,
|
|
"mean_token_accuracy": 0.1625274196267128,
|
|
"num_tokens": 5896551.0,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"entropy": 5.583553075790405,
|
|
"epoch": 2.633228840125392,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00045671005633287986,
|
|
"loss": 5.3704,
|
|
"mean_token_accuracy": 0.16467083990573883,
|
|
"num_tokens": 5905485.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 5.567110204696656,
|
|
"epoch": 2.6371473354231973,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004565325085703336,
|
|
"loss": 5.4011,
|
|
"mean_token_accuracy": 0.16050159335136413,
|
|
"num_tokens": 5915166.0,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"entropy": 5.640669679641723,
|
|
"epoch": 2.6410658307210033,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004563546363797602,
|
|
"loss": 5.4344,
|
|
"mean_token_accuracy": 0.16531543731689452,
|
|
"num_tokens": 5923782.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 5.621229076385498,
|
|
"epoch": 2.644984326018809,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004561764400790465,
|
|
"loss": 5.528,
|
|
"mean_token_accuracy": 0.15729653984308242,
|
|
"num_tokens": 5931904.0,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"entropy": 5.6186549186706545,
|
|
"epoch": 2.6489028213166144,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045599791998665796,
|
|
"loss": 5.4374,
|
|
"mean_token_accuracy": 0.16541447937488557,
|
|
"num_tokens": 5940067.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 5.616342639923095,
|
|
"epoch": 2.6528213166144203,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004558190764216389,
|
|
"loss": 5.4176,
|
|
"mean_token_accuracy": 0.16869635432958602,
|
|
"num_tokens": 5948663.0,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"entropy": 5.684710073471069,
|
|
"epoch": 2.656739811912226,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004556399097036119,
|
|
"loss": 5.4253,
|
|
"mean_token_accuracy": 0.1626061663031578,
|
|
"num_tokens": 5957224.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 5.622564649581909,
|
|
"epoch": 2.6606583072100314,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004554604201527768,
|
|
"loss": 5.5362,
|
|
"mean_token_accuracy": 0.1558627665042877,
|
|
"num_tokens": 5965604.0,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"entropy": 5.571040296554566,
|
|
"epoch": 2.664576802507837,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00045528060808991075,
|
|
"loss": 5.3626,
|
|
"mean_token_accuracy": 0.16696271449327468,
|
|
"num_tokens": 5974309.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 5.546795654296875,
|
|
"epoch": 2.6684952978056424,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004551004738363669,
|
|
"loss": 5.3774,
|
|
"mean_token_accuracy": 0.1614482581615448,
|
|
"num_tokens": 5982315.0,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"entropy": 5.631612110137939,
|
|
"epoch": 2.6724137931034484,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00045492001771407434,
|
|
"loss": 5.3955,
|
|
"mean_token_accuracy": 0.17008334398269653,
|
|
"num_tokens": 5990515.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 5.632847738265991,
|
|
"epoch": 2.676332288401254,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004547392400455374,
|
|
"loss": 5.5513,
|
|
"mean_token_accuracy": 0.1599658966064453,
|
|
"num_tokens": 6000431.0,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"entropy": 5.612500715255737,
|
|
"epoch": 2.6802507836990594,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004545581411538353,
|
|
"loss": 5.4756,
|
|
"mean_token_accuracy": 0.15511866062879562,
|
|
"num_tokens": 6009022.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 5.65234489440918,
|
|
"epoch": 2.6841692789968654,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00045437672136262083,
|
|
"loss": 5.4005,
|
|
"mean_token_accuracy": 0.16104743182659148,
|
|
"num_tokens": 6019330.0,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"entropy": 5.538012981414795,
|
|
"epoch": 2.688087774294671,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004541949809961208,
|
|
"loss": 5.4257,
|
|
"mean_token_accuracy": 0.17339180707931517,
|
|
"num_tokens": 6027492.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 5.658903741836548,
|
|
"epoch": 2.6920062695924765,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004540129203791346,
|
|
"loss": 5.4545,
|
|
"mean_token_accuracy": 0.16489011645317078,
|
|
"num_tokens": 6036467.0,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"entropy": 5.572162246704101,
|
|
"epoch": 2.695924764890282,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045383053983703413,
|
|
"loss": 5.407,
|
|
"mean_token_accuracy": 0.16315893530845643,
|
|
"num_tokens": 6045711.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 5.554269123077392,
|
|
"epoch": 2.6998432601880875,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045364783969576296,
|
|
"loss": 5.4705,
|
|
"mean_token_accuracy": 0.16008178889751434,
|
|
"num_tokens": 6054626.0,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"entropy": 5.646736717224121,
|
|
"epoch": 2.7037617554858935,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045346482028183583,
|
|
"loss": 5.534,
|
|
"mean_token_accuracy": 0.1618146926164627,
|
|
"num_tokens": 6064118.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 5.637236022949219,
|
|
"epoch": 2.707680250783699,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00045328148192233823,
|
|
"loss": 5.4373,
|
|
"mean_token_accuracy": 0.16148429214954377,
|
|
"num_tokens": 6072723.0,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"entropy": 5.633076858520508,
|
|
"epoch": 2.7115987460815045,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004530978249449254,
|
|
"loss": 5.4056,
|
|
"mean_token_accuracy": 0.17186853736639024,
|
|
"num_tokens": 6081535.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 5.590723657608033,
|
|
"epoch": 2.7155172413793105,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004529138496778222,
|
|
"loss": 5.3805,
|
|
"mean_token_accuracy": 0.16959122717380523,
|
|
"num_tokens": 6090524.0,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"entropy": 5.596432876586914,
|
|
"epoch": 2.719435736677116,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004527295564498222,
|
|
"loss": 5.4465,
|
|
"mean_token_accuracy": 0.16445804834365846,
|
|
"num_tokens": 6099870.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 5.6235284328460695,
|
|
"epoch": 2.7233542319749215,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004525449455902874,
|
|
"loss": 5.4235,
|
|
"mean_token_accuracy": 0.16751915067434311,
|
|
"num_tokens": 6107760.0,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"entropy": 5.6068034172058105,
|
|
"epoch": 2.7272727272727275,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004523600174291473,
|
|
"loss": 5.4883,
|
|
"mean_token_accuracy": 0.16328197419643403,
|
|
"num_tokens": 6117135.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 5.559489727020264,
|
|
"epoch": 2.731191222570533,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004521747722968985,
|
|
"loss": 5.4378,
|
|
"mean_token_accuracy": 0.1646498441696167,
|
|
"num_tokens": 6125711.0,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"entropy": 5.658491802215576,
|
|
"epoch": 2.7351097178683386,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00045198921052460396,
|
|
"loss": 5.5131,
|
|
"mean_token_accuracy": 0.15748890489339828,
|
|
"num_tokens": 6134623.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 5.7147058010101315,
|
|
"epoch": 2.739028213166144,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004518033324438928,
|
|
"loss": 5.4871,
|
|
"mean_token_accuracy": 0.15890766084194183,
|
|
"num_tokens": 6143228.0,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"entropy": 5.552452516555786,
|
|
"epoch": 2.7429467084639496,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004516171383869593,
|
|
"loss": 5.469,
|
|
"mean_token_accuracy": 0.1633103907108307,
|
|
"num_tokens": 6152021.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 2.7429467084639496,
|
|
"eval_entropy": 5.414383830026138,
|
|
"eval_loss": 5.8682355880737305,
|
|
"eval_mean_token_accuracy": 0.15274352232326371,
|
|
"eval_num_tokens": 6152021.0,
|
|
"eval_runtime": 2.8317,
|
|
"eval_samples_per_second": 1455.64,
|
|
"eval_steps_per_second": 182.22,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 5.643088293075562,
|
|
"epoch": 2.7468652037617556,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045143062868656234,
|
|
"loss": 5.5211,
|
|
"mean_token_accuracy": 0.1622963473200798,
|
|
"num_tokens": 6161389.0,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"entropy": 5.6994085788726805,
|
|
"epoch": 2.750783699059561,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000451243803676025,
|
|
"loss": 5.4581,
|
|
"mean_token_accuracy": 0.16562668979167938,
|
|
"num_tokens": 6170381.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 5.587498950958252,
|
|
"epoch": 2.7547021943573666,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045105666368923397,
|
|
"loss": 5.4058,
|
|
"mean_token_accuracy": 0.1639431193470955,
|
|
"num_tokens": 6179071.0,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"entropy": 5.541202402114868,
|
|
"epoch": 2.7586206896551726,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00045086920906063866,
|
|
"loss": 5.4143,
|
|
"mean_token_accuracy": 0.1619719982147217,
|
|
"num_tokens": 6187896.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 5.555576372146606,
|
|
"epoch": 2.762539184952978,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00045068144012525095,
|
|
"loss": 5.3908,
|
|
"mean_token_accuracy": 0.17135699987411498,
|
|
"num_tokens": 6196328.0,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"entropy": 5.511662530899048,
|
|
"epoch": 2.7664576802507836,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045049335721864426,
|
|
"loss": 5.4228,
|
|
"mean_token_accuracy": 0.15722364485263823,
|
|
"num_tokens": 6204703.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 5.598114967346191,
|
|
"epoch": 2.7703761755485896,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045030496067695336,
|
|
"loss": 5.4198,
|
|
"mean_token_accuracy": 0.16125878989696502,
|
|
"num_tokens": 6213496.0,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"entropy": 5.602514410018921,
|
|
"epoch": 2.774294670846395,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004501162508368733,
|
|
"loss": 5.3708,
|
|
"mean_token_accuracy": 0.16854795217514038,
|
|
"num_tokens": 6222112.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 5.5321849346160885,
|
|
"epoch": 2.7782131661442007,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004499272280356594,
|
|
"loss": 5.4907,
|
|
"mean_token_accuracy": 0.1533934846520424,
|
|
"num_tokens": 6230288.0,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"entropy": 5.587235784530639,
|
|
"epoch": 2.782131661442006,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004497378926111257,
|
|
"loss": 5.345,
|
|
"mean_token_accuracy": 0.17369745969772338,
|
|
"num_tokens": 6238608.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 5.627431440353393,
|
|
"epoch": 2.7860501567398117,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004495482449016456,
|
|
"loss": 5.4798,
|
|
"mean_token_accuracy": 0.16244604885578157,
|
|
"num_tokens": 6247902.0,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"entropy": 5.574715185165405,
|
|
"epoch": 2.7899686520376177,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004493582852461501,
|
|
"loss": 5.4332,
|
|
"mean_token_accuracy": 0.16872989386320114,
|
|
"num_tokens": 6257433.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 5.572787284851074,
|
|
"epoch": 2.793887147335423,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004491680139841281,
|
|
"loss": 5.4158,
|
|
"mean_token_accuracy": 0.16313964426517485,
|
|
"num_tokens": 6266017.0,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"entropy": 5.630651569366455,
|
|
"epoch": 2.7978056426332287,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000448977431455625,
|
|
"loss": 5.5203,
|
|
"mean_token_accuracy": 0.15141590163111687,
|
|
"num_tokens": 6274759.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 5.641405630111694,
|
|
"epoch": 2.8017241379310347,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00044878653800124285,
|
|
"loss": 5.3845,
|
|
"mean_token_accuracy": 0.16816764771938325,
|
|
"num_tokens": 6283699.0,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"entropy": 5.567658567428589,
|
|
"epoch": 2.80564263322884,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004485953339621391,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.16217992901802064,
|
|
"num_tokens": 6292562.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 5.533685970306396,
|
|
"epoch": 2.8095611285266457,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004484038196800265,
|
|
"loss": 5.3776,
|
|
"mean_token_accuracy": 0.1700371041893959,
|
|
"num_tokens": 6300928.0,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"entropy": 5.546323251724243,
|
|
"epoch": 2.8134796238244513,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004482119954971719,
|
|
"loss": 5.3093,
|
|
"mean_token_accuracy": 0.16769690960645675,
|
|
"num_tokens": 6309994.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 5.619518089294433,
|
|
"epoch": 2.817398119122257,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00044801986175639635,
|
|
"loss": 5.4345,
|
|
"mean_token_accuracy": 0.16344505101442336,
|
|
"num_tokens": 6318856.0,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"entropy": 5.5285991668701175,
|
|
"epoch": 2.8213166144200628,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004478274188010741,
|
|
"loss": 5.4324,
|
|
"mean_token_accuracy": 0.1600890651345253,
|
|
"num_tokens": 6327274.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 5.677964973449707,
|
|
"epoch": 2.8252351097178683,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00044763466697513173,
|
|
"loss": 5.491,
|
|
"mean_token_accuracy": 0.17462221533060074,
|
|
"num_tokens": 6335957.0,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"entropy": 5.668510293960571,
|
|
"epoch": 2.829153605015674,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00044744160662304805,
|
|
"loss": 5.4987,
|
|
"mean_token_accuracy": 0.16850878596305846,
|
|
"num_tokens": 6344673.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 5.561325788497925,
|
|
"epoch": 2.83307210031348,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00044724823808985325,
|
|
"loss": 5.3639,
|
|
"mean_token_accuracy": 0.1669614925980568,
|
|
"num_tokens": 6353085.0,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"entropy": 5.608777952194214,
|
|
"epoch": 2.8369905956112853,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004470545617211283,
|
|
"loss": 5.4259,
|
|
"mean_token_accuracy": 0.16076537147164344,
|
|
"num_tokens": 6362107.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 5.584958410263061,
|
|
"epoch": 2.840909090909091,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044686057786300423,
|
|
"loss": 5.3999,
|
|
"mean_token_accuracy": 0.16815428733825682,
|
|
"num_tokens": 6370381.0,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"entropy": 5.557851696014405,
|
|
"epoch": 2.844827586206897,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044666628686216154,
|
|
"loss": 5.4355,
|
|
"mean_token_accuracy": 0.16155474483966828,
|
|
"num_tokens": 6378790.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 5.510302448272705,
|
|
"epoch": 2.8487460815047023,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00044647168906583,
|
|
"loss": 5.4135,
|
|
"mean_token_accuracy": 0.16343684494495392,
|
|
"num_tokens": 6387997.0,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"entropy": 5.608069324493409,
|
|
"epoch": 2.852664576802508,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044627678482178716,
|
|
"loss": 5.4346,
|
|
"mean_token_accuracy": 0.16666958928108216,
|
|
"num_tokens": 6397458.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 5.595707035064697,
|
|
"epoch": 2.8565830721003134,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004460815744783587,
|
|
"loss": 5.3743,
|
|
"mean_token_accuracy": 0.16629649698734283,
|
|
"num_tokens": 6405481.0,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"entropy": 5.604511451721192,
|
|
"epoch": 2.860501567398119,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000445886058384417,
|
|
"loss": 5.4924,
|
|
"mean_token_accuracy": 0.15888008326292039,
|
|
"num_tokens": 6414087.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 5.607482194900513,
|
|
"epoch": 2.864420062695925,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004456902368893811,
|
|
"loss": 5.3191,
|
|
"mean_token_accuracy": 0.1759709596633911,
|
|
"num_tokens": 6421934.0,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"entropy": 5.54454607963562,
|
|
"epoch": 2.8683385579937304,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004454941103432158,
|
|
"loss": 5.3414,
|
|
"mean_token_accuracy": 0.1674615979194641,
|
|
"num_tokens": 6431028.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 5.46548752784729,
|
|
"epoch": 2.872257053291536,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044529767909643093,
|
|
"loss": 5.2974,
|
|
"mean_token_accuracy": 0.17255107015371324,
|
|
"num_tokens": 6439913.0,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"entropy": 5.61080904006958,
|
|
"epoch": 2.876175548589342,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004451009435000811,
|
|
"loss": 5.3955,
|
|
"mean_token_accuracy": 0.16260457783937454,
|
|
"num_tokens": 6449161.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 5.580524539947509,
|
|
"epoch": 2.8800940438871474,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004449039039057647,
|
|
"loss": 5.5008,
|
|
"mean_token_accuracy": 0.16669165194034577,
|
|
"num_tokens": 6457834.0,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"entropy": 5.509667015075683,
|
|
"epoch": 2.884012539184953,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00044470656066562336,
|
|
"loss": 5.3213,
|
|
"mean_token_accuracy": 0.17255610674619676,
|
|
"num_tokens": 6465879.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 5.502434110641479,
|
|
"epoch": 2.887931034482759,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004445089141323415,
|
|
"loss": 5.3251,
|
|
"mean_token_accuracy": 0.16640148162841797,
|
|
"num_tokens": 6474801.0,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"entropy": 5.540452337265014,
|
|
"epoch": 2.8918495297805644,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00044431096465914554,
|
|
"loss": 5.3657,
|
|
"mean_token_accuracy": 0.1723470151424408,
|
|
"num_tokens": 6483532.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 5.532334518432617,
|
|
"epoch": 2.89576802507837,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00044411271259980315,
|
|
"loss": 5.3465,
|
|
"mean_token_accuracy": 0.1612808346748352,
|
|
"num_tokens": 6491372.0,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"entropy": 5.501596736907959,
|
|
"epoch": 2.8996865203761755,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004439141583086231,
|
|
"loss": 5.3612,
|
|
"mean_token_accuracy": 0.16393718719482422,
|
|
"num_tokens": 6500001.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 5.669638776779175,
|
|
"epoch": 2.903605015673981,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00044371530214045395,
|
|
"loss": 5.4632,
|
|
"mean_token_accuracy": 0.16313758194446565,
|
|
"num_tokens": 6507750.0,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"entropy": 5.489681100845337,
|
|
"epoch": 2.907523510971787,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00044351614445068413,
|
|
"loss": 5.3241,
|
|
"mean_token_accuracy": 0.16466565132141114,
|
|
"num_tokens": 6517888.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 5.614689445495605,
|
|
"epoch": 2.9114420062695925,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00044331668559524043,
|
|
"loss": 5.4777,
|
|
"mean_token_accuracy": 0.1595204085111618,
|
|
"num_tokens": 6526573.0,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"entropy": 5.553948974609375,
|
|
"epoch": 2.915360501567398,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004431169259305883,
|
|
"loss": 5.3968,
|
|
"mean_token_accuracy": 0.16814989745616912,
|
|
"num_tokens": 6536544.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 5.6320888042449955,
|
|
"epoch": 2.919278996865204,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004429168658137306,
|
|
"loss": 5.4218,
|
|
"mean_token_accuracy": 0.16275950148701668,
|
|
"num_tokens": 6544982.0,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"entropy": 5.551886749267578,
|
|
"epoch": 2.9231974921630095,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044271650560220746,
|
|
"loss": 5.4456,
|
|
"mean_token_accuracy": 0.16412553191184998,
|
|
"num_tokens": 6553664.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 5.588349199295044,
|
|
"epoch": 2.927115987460815,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00044251584565409464,
|
|
"loss": 5.4212,
|
|
"mean_token_accuracy": 0.16144543141126633,
|
|
"num_tokens": 6562234.0,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"entropy": 5.522047853469848,
|
|
"epoch": 2.9310344827586206,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004423148863280044,
|
|
"loss": 5.3825,
|
|
"mean_token_accuracy": 0.16582272052764893,
|
|
"num_tokens": 6570707.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 5.623364686965942,
|
|
"epoch": 2.934952978056426,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044211362798308334,
|
|
"loss": 5.4298,
|
|
"mean_token_accuracy": 0.16516808271408082,
|
|
"num_tokens": 6580054.0,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"entropy": 5.5762903690338135,
|
|
"epoch": 2.938871473354232,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004419120709790129,
|
|
"loss": 5.487,
|
|
"mean_token_accuracy": 0.16166716068983078,
|
|
"num_tokens": 6588585.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 5.612081909179688,
|
|
"epoch": 2.9427899686520376,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044171021567600814,
|
|
"loss": 5.4734,
|
|
"mean_token_accuracy": 0.16496185213327408,
|
|
"num_tokens": 6597290.0,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"entropy": 5.599429178237915,
|
|
"epoch": 2.946708463949843,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00044150806243481715,
|
|
"loss": 5.3989,
|
|
"mean_token_accuracy": 0.1622632399201393,
|
|
"num_tokens": 6604966.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 5.605414915084839,
|
|
"epoch": 2.950626959247649,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004413056116167206,
|
|
"loss": 5.3836,
|
|
"mean_token_accuracy": 0.16811488717794418,
|
|
"num_tokens": 6613588.0,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"entropy": 5.694739866256714,
|
|
"epoch": 2.9545454545454546,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004411028635835309,
|
|
"loss": 5.6062,
|
|
"mean_token_accuracy": 0.14811502546072006,
|
|
"num_tokens": 6622907.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 5.5383378028869625,
|
|
"epoch": 2.95846394984326,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004408998186975917,
|
|
"loss": 5.3947,
|
|
"mean_token_accuracy": 0.16675213277339934,
|
|
"num_tokens": 6632074.0,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"entropy": 5.624499273300171,
|
|
"epoch": 2.962382445141066,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00044069647732177696,
|
|
"loss": 5.4263,
|
|
"mean_token_accuracy": 0.16085358709096909,
|
|
"num_tokens": 6640595.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 5.641710186004639,
|
|
"epoch": 2.9663009404388716,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00044049283981949103,
|
|
"loss": 5.4727,
|
|
"mean_token_accuracy": 0.15902462005615234,
|
|
"num_tokens": 6649861.0,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"entropy": 5.660275220870972,
|
|
"epoch": 2.970219435736677,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004402889065546667,
|
|
"loss": 5.4867,
|
|
"mean_token_accuracy": 0.15586813762784005,
|
|
"num_tokens": 6659157.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 5.599219799041748,
|
|
"epoch": 2.9741379310344827,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00044008467789176625,
|
|
"loss": 5.4518,
|
|
"mean_token_accuracy": 0.16005596220493318,
|
|
"num_tokens": 6667979.0,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"entropy": 5.590052700042724,
|
|
"epoch": 2.978056426332288,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004398801541957791,
|
|
"loss": 5.4034,
|
|
"mean_token_accuracy": 0.16107274889945983,
|
|
"num_tokens": 6677239.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 5.590973901748657,
|
|
"epoch": 2.981974921630094,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004396753358322223,
|
|
"loss": 5.5295,
|
|
"mean_token_accuracy": 0.15927643179893494,
|
|
"num_tokens": 6687317.0,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"entropy": 5.551632881164551,
|
|
"epoch": 2.9858934169278997,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004394702231671396,
|
|
"loss": 5.413,
|
|
"mean_token_accuracy": 0.161776627600193,
|
|
"num_tokens": 6695732.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 5.546460056304932,
|
|
"epoch": 2.989811912225705,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004392648165671004,
|
|
"loss": 5.3254,
|
|
"mean_token_accuracy": 0.17594451904296876,
|
|
"num_tokens": 6704272.0,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"entropy": 5.552908229827881,
|
|
"epoch": 2.993730407523511,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004390591163991998,
|
|
"loss": 5.3706,
|
|
"mean_token_accuracy": 0.16397657990455627,
|
|
"num_tokens": 6713355.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 5.587871837615967,
|
|
"epoch": 2.9976489028213167,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00043885312303105725,
|
|
"loss": 5.4658,
|
|
"mean_token_accuracy": 0.1653267815709114,
|
|
"num_tokens": 6721952.0,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"entropy": 5.543058776855469,
|
|
"epoch": 3.001567398119122,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004386468368308163,
|
|
"loss": 5.2734,
|
|
"mean_token_accuracy": 0.17160003930330275,
|
|
"num_tokens": 6730582.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 5.568611288070679,
|
|
"epoch": 3.0054858934169277,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004384402581671438,
|
|
"loss": 5.0631,
|
|
"mean_token_accuracy": 0.17696447372436525,
|
|
"num_tokens": 6739370.0,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"entropy": 5.526371908187866,
|
|
"epoch": 3.0094043887147337,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004382333874092295,
|
|
"loss": 5.0431,
|
|
"mean_token_accuracy": 0.18205696493387222,
|
|
"num_tokens": 6748282.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 5.6190471172332765,
|
|
"epoch": 3.0133228840125392,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00043802622492678466,
|
|
"loss": 5.1684,
|
|
"mean_token_accuracy": 0.1718669578433037,
|
|
"num_tokens": 6757622.0,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"entropy": 5.560507535934448,
|
|
"epoch": 3.0172413793103448,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004378187710900426,
|
|
"loss": 5.1056,
|
|
"mean_token_accuracy": 0.17592935264110565,
|
|
"num_tokens": 6765780.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 5.544382572174072,
|
|
"epoch": 3.0211598746081503,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00043761102626975674,
|
|
"loss": 5.1099,
|
|
"mean_token_accuracy": 0.17691410183906556,
|
|
"num_tokens": 6774343.0,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"entropy": 5.529460048675537,
|
|
"epoch": 3.0250783699059562,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004374029908372007,
|
|
"loss": 5.1041,
|
|
"mean_token_accuracy": 0.1808921843767166,
|
|
"num_tokens": 6783765.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 5.6187409400939945,
|
|
"epoch": 3.0289968652037618,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00043719466516416774,
|
|
"loss": 5.166,
|
|
"mean_token_accuracy": 0.17624239325523378,
|
|
"num_tokens": 6792102.0,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"entropy": 5.602795743942261,
|
|
"epoch": 3.0329153605015673,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043698604962296946,
|
|
"loss": 5.09,
|
|
"mean_token_accuracy": 0.17620307356119155,
|
|
"num_tokens": 6801435.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 5.456713485717773,
|
|
"epoch": 3.0368338557993733,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00043677714458643566,
|
|
"loss": 5.1205,
|
|
"mean_token_accuracy": 0.1735727608203888,
|
|
"num_tokens": 6810409.0,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"entropy": 5.4551129817962645,
|
|
"epoch": 3.040752351097179,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00043656795042791357,
|
|
"loss": 5.0066,
|
|
"mean_token_accuracy": 0.18492254316806794,
|
|
"num_tokens": 6818752.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 5.5124578952789305,
|
|
"epoch": 3.0446708463949843,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004363584675212671,
|
|
"loss": 5.0905,
|
|
"mean_token_accuracy": 0.17838650196790695,
|
|
"num_tokens": 6827880.0,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"entropy": 5.454521226882934,
|
|
"epoch": 3.04858934169279,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004361486962408761,
|
|
"loss": 5.0865,
|
|
"mean_token_accuracy": 0.18407093435525895,
|
|
"num_tokens": 6835865.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 5.49459433555603,
|
|
"epoch": 3.052507836990596,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004359386369616359,
|
|
"loss": 5.0443,
|
|
"mean_token_accuracy": 0.1849384769797325,
|
|
"num_tokens": 6843851.0,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"entropy": 5.471823406219483,
|
|
"epoch": 3.0564263322884013,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004357282900589565,
|
|
"loss": 5.0965,
|
|
"mean_token_accuracy": 0.1811446502804756,
|
|
"num_tokens": 6852649.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 5.47971544265747,
|
|
"epoch": 3.060344827586207,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00043551765590876183,
|
|
"loss": 5.0536,
|
|
"mean_token_accuracy": 0.18287423402070999,
|
|
"num_tokens": 6861607.0,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"entropy": 5.440199613571167,
|
|
"epoch": 3.0642633228840124,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004353067348874894,
|
|
"loss": 5.0757,
|
|
"mean_token_accuracy": 0.1747647225856781,
|
|
"num_tokens": 6870650.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 5.555355358123779,
|
|
"epoch": 3.0681818181818183,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00043509552737208923,
|
|
"loss": 5.1226,
|
|
"mean_token_accuracy": 0.167980919778347,
|
|
"num_tokens": 6879820.0,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"entropy": 5.418352794647217,
|
|
"epoch": 3.072100313479624,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004348840337400233,
|
|
"loss": 5.0409,
|
|
"mean_token_accuracy": 0.18314133137464522,
|
|
"num_tokens": 6888329.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 5.443490362167358,
|
|
"epoch": 3.0760188087774294,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00043467225436926517,
|
|
"loss": 5.1256,
|
|
"mean_token_accuracy": 0.17643692940473557,
|
|
"num_tokens": 6897635.0,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"entropy": 5.520525932312012,
|
|
"epoch": 3.079937304075235,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004344601896382988,
|
|
"loss": 5.1636,
|
|
"mean_token_accuracy": 0.18288996070623398,
|
|
"num_tokens": 6907275.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 5.523024606704712,
|
|
"epoch": 3.083855799373041,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00043424783992611837,
|
|
"loss": 5.1091,
|
|
"mean_token_accuracy": 0.17494795471429825,
|
|
"num_tokens": 6916377.0,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"entropy": 5.4875256538391115,
|
|
"epoch": 3.0877742946708464,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00043403520561222705,
|
|
"loss": 5.0059,
|
|
"mean_token_accuracy": 0.19060440957546235,
|
|
"num_tokens": 6924355.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 5.420040082931519,
|
|
"epoch": 3.091692789968652,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004338222870766371,
|
|
"loss": 5.1104,
|
|
"mean_token_accuracy": 0.1827986016869545,
|
|
"num_tokens": 6933328.0,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"entropy": 5.550303268432617,
|
|
"epoch": 3.0956112852664575,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00043360908469986827,
|
|
"loss": 5.1158,
|
|
"mean_token_accuracy": 0.1787843018770218,
|
|
"num_tokens": 6942189.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 5.462327671051026,
|
|
"epoch": 3.0995297805642634,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004333955988629478,
|
|
"loss": 5.076,
|
|
"mean_token_accuracy": 0.1802074134349823,
|
|
"num_tokens": 6951188.0,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"entropy": 5.432707262039185,
|
|
"epoch": 3.103448275862069,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00043318182994740945,
|
|
"loss": 5.0336,
|
|
"mean_token_accuracy": 0.18666609823703767,
|
|
"num_tokens": 6958718.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 5.511406707763672,
|
|
"epoch": 3.1073667711598745,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004329677783352931,
|
|
"loss": 5.1119,
|
|
"mean_token_accuracy": 0.17655452340841293,
|
|
"num_tokens": 6968040.0,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"entropy": 5.499683237075805,
|
|
"epoch": 3.1112852664576804,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004327534444091436,
|
|
"loss": 5.1113,
|
|
"mean_token_accuracy": 0.1770559012889862,
|
|
"num_tokens": 6976977.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 5.445043706893921,
|
|
"epoch": 3.115203761755486,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00043253882855201037,
|
|
"loss": 5.0948,
|
|
"mean_token_accuracy": 0.18210539519786834,
|
|
"num_tokens": 6985859.0,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"entropy": 5.35231466293335,
|
|
"epoch": 3.1191222570532915,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00043232393114744683,
|
|
"loss": 4.9988,
|
|
"mean_token_accuracy": 0.19316509366035461,
|
|
"num_tokens": 6994031.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 5.412227296829224,
|
|
"epoch": 3.123040752351097,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004321087525795095,
|
|
"loss": 5.0556,
|
|
"mean_token_accuracy": 0.18158914446830748,
|
|
"num_tokens": 7002527.0,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"entropy": 5.5304759502410885,
|
|
"epoch": 3.126959247648903,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004318932932327573,
|
|
"loss": 5.1739,
|
|
"mean_token_accuracy": 0.1824020892381668,
|
|
"num_tokens": 7011983.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 5.5312965393066404,
|
|
"epoch": 3.1308777429467085,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000431677553492251,
|
|
"loss": 5.119,
|
|
"mean_token_accuracy": 0.1740437164902687,
|
|
"num_tokens": 7020819.0,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"entropy": 5.410348653793335,
|
|
"epoch": 3.134796238244514,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00043146153374355256,
|
|
"loss": 5.006,
|
|
"mean_token_accuracy": 0.18658973425626754,
|
|
"num_tokens": 7029257.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 3.134796238244514,
|
|
"eval_entropy": 5.3644778922546745,
|
|
"eval_loss": 5.828073978424072,
|
|
"eval_mean_token_accuracy": 0.15688188456345437,
|
|
"eval_num_tokens": 7029257.0,
|
|
"eval_runtime": 2.8339,
|
|
"eval_samples_per_second": 1454.534,
|
|
"eval_steps_per_second": 182.081,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 5.479976797103882,
|
|
"epoch": 3.1387147335423196,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00043124523437272427,
|
|
"loss": 5.1671,
|
|
"mean_token_accuracy": 0.1771426811814308,
|
|
"num_tokens": 7038942.0,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"entropy": 5.525044679641724,
|
|
"epoch": 3.1426332288401255,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004310286557663282,
|
|
"loss": 5.1505,
|
|
"mean_token_accuracy": 0.17958650290966033,
|
|
"num_tokens": 7048074.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 5.52232346534729,
|
|
"epoch": 3.146551724137931,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004308117983114254,
|
|
"loss": 5.0553,
|
|
"mean_token_accuracy": 0.18663895428180693,
|
|
"num_tokens": 7056887.0,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"entropy": 5.435352230072022,
|
|
"epoch": 3.1504702194357366,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004305946623955754,
|
|
"loss": 5.0523,
|
|
"mean_token_accuracy": 0.19024786949157715,
|
|
"num_tokens": 7065462.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 5.471118497848511,
|
|
"epoch": 3.1543887147335425,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00043037724840683516,
|
|
"loss": 5.1474,
|
|
"mean_token_accuracy": 0.17960819303989412,
|
|
"num_tokens": 7074188.0,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"entropy": 5.477698230743409,
|
|
"epoch": 3.158307210031348,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00043015955673375876,
|
|
"loss": 5.122,
|
|
"mean_token_accuracy": 0.18004380017518998,
|
|
"num_tokens": 7082282.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 5.50840573310852,
|
|
"epoch": 3.1622257053291536,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004299415877653966,
|
|
"loss": 5.1723,
|
|
"mean_token_accuracy": 0.17248133569955826,
|
|
"num_tokens": 7090631.0,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"entropy": 5.473033285140991,
|
|
"epoch": 3.166144200626959,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004297233418912945,
|
|
"loss": 5.1086,
|
|
"mean_token_accuracy": 0.17797058075666428,
|
|
"num_tokens": 7099546.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 5.474239778518677,
|
|
"epoch": 3.170062695924765,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004295048195014932,
|
|
"loss": 5.1465,
|
|
"mean_token_accuracy": 0.1752048373222351,
|
|
"num_tokens": 7109423.0,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"entropy": 5.412190675735474,
|
|
"epoch": 3.1739811912225706,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004292860209865277,
|
|
"loss": 5.0146,
|
|
"mean_token_accuracy": 0.185761658847332,
|
|
"num_tokens": 7117717.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 5.462385177612305,
|
|
"epoch": 3.177899686520376,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004290669467374263,
|
|
"loss": 5.2407,
|
|
"mean_token_accuracy": 0.1639290541410446,
|
|
"num_tokens": 7126573.0,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"entropy": 5.47260046005249,
|
|
"epoch": 3.1818181818181817,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00042884759714571037,
|
|
"loss": 5.1361,
|
|
"mean_token_accuracy": 0.17528676837682725,
|
|
"num_tokens": 7134471.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 5.362086915969849,
|
|
"epoch": 3.1857366771159876,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004286279726033932,
|
|
"loss": 5.0691,
|
|
"mean_token_accuracy": 0.18219161480665208,
|
|
"num_tokens": 7143092.0,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"entropy": 5.519996929168701,
|
|
"epoch": 3.189655172413793,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00042840807350297933,
|
|
"loss": 5.1474,
|
|
"mean_token_accuracy": 0.17893998175859452,
|
|
"num_tokens": 7152137.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 5.500849103927612,
|
|
"epoch": 3.1935736677115987,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00042818790023746407,
|
|
"loss": 5.1012,
|
|
"mean_token_accuracy": 0.17794661372900009,
|
|
"num_tokens": 7160037.0,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"entropy": 5.409151077270508,
|
|
"epoch": 3.197492163009404,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00042796745320033296,
|
|
"loss": 5.0934,
|
|
"mean_token_accuracy": 0.17724834829568864,
|
|
"num_tokens": 7168354.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 5.4781488418579105,
|
|
"epoch": 3.20141065830721,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00042774673278556043,
|
|
"loss": 5.1611,
|
|
"mean_token_accuracy": 0.1761411026120186,
|
|
"num_tokens": 7176757.0,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"entropy": 5.479331970214844,
|
|
"epoch": 3.2053291536050157,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004275257393876097,
|
|
"loss": 5.1466,
|
|
"mean_token_accuracy": 0.1715711236000061,
|
|
"num_tokens": 7185489.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 5.523379135131836,
|
|
"epoch": 3.209247648902821,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004273044734014318,
|
|
"loss": 5.0853,
|
|
"mean_token_accuracy": 0.1794649213552475,
|
|
"num_tokens": 7194431.0,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"entropy": 5.3604803562164305,
|
|
"epoch": 3.2131661442006267,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00042708293522246486,
|
|
"loss": 5.0738,
|
|
"mean_token_accuracy": 0.17980273962020873,
|
|
"num_tokens": 7203074.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 5.456855535507202,
|
|
"epoch": 3.2170846394984327,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004268611252466337,
|
|
"loss": 5.1036,
|
|
"mean_token_accuracy": 0.17986776679754257,
|
|
"num_tokens": 7211692.0,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"entropy": 5.413537836074829,
|
|
"epoch": 3.2210031347962382,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004266390438703486,
|
|
"loss": 5.093,
|
|
"mean_token_accuracy": 0.17688279300928117,
|
|
"num_tokens": 7220285.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 5.509257078170776,
|
|
"epoch": 3.2249216300940438,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00042641669149050493,
|
|
"loss": 5.2339,
|
|
"mean_token_accuracy": 0.17110467851161956,
|
|
"num_tokens": 7230007.0,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"entropy": 5.488560152053833,
|
|
"epoch": 3.2288401253918497,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004261940685044825,
|
|
"loss": 5.0905,
|
|
"mean_token_accuracy": 0.17679268568754197,
|
|
"num_tokens": 7238520.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 5.3976846694946286,
|
|
"epoch": 3.2327586206896552,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00042597117531014474,
|
|
"loss": 5.0265,
|
|
"mean_token_accuracy": 0.18420617133378983,
|
|
"num_tokens": 7247228.0,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"entropy": 5.532719135284424,
|
|
"epoch": 3.2366771159874608,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004257480123058378,
|
|
"loss": 5.2695,
|
|
"mean_token_accuracy": 0.1700225442647934,
|
|
"num_tokens": 7255782.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 5.508197259902954,
|
|
"epoch": 3.2405956112852663,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00042552457989039036,
|
|
"loss": 5.1426,
|
|
"mean_token_accuracy": 0.1676323667168617,
|
|
"num_tokens": 7264946.0,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"entropy": 5.349348306655884,
|
|
"epoch": 3.2445141065830723,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00042530087846311213,
|
|
"loss": 4.9496,
|
|
"mean_token_accuracy": 0.193049119412899,
|
|
"num_tokens": 7273647.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 5.459439086914062,
|
|
"epoch": 3.248432601880878,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00042507690842379396,
|
|
"loss": 5.1177,
|
|
"mean_token_accuracy": 0.17937442511320115,
|
|
"num_tokens": 7283393.0,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"entropy": 5.53988881111145,
|
|
"epoch": 3.2523510971786833,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00042485267017270664,
|
|
"loss": 5.2106,
|
|
"mean_token_accuracy": 0.17449098229408264,
|
|
"num_tokens": 7292368.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 5.410909795761109,
|
|
"epoch": 3.256269592476489,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00042462816411060025,
|
|
"loss": 5.1283,
|
|
"mean_token_accuracy": 0.18129791021347047,
|
|
"num_tokens": 7301505.0,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"entropy": 5.447917032241821,
|
|
"epoch": 3.260188087774295,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004244033906387035,
|
|
"loss": 5.0887,
|
|
"mean_token_accuracy": 0.18481171876192093,
|
|
"num_tokens": 7310011.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 5.489086627960205,
|
|
"epoch": 3.2641065830721003,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004241783501587231,
|
|
"loss": 5.132,
|
|
"mean_token_accuracy": 0.18020764291286467,
|
|
"num_tokens": 7319295.0,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"entropy": 5.476145839691162,
|
|
"epoch": 3.268025078369906,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00042395304307284284,
|
|
"loss": 5.1608,
|
|
"mean_token_accuracy": 0.1762930765748024,
|
|
"num_tokens": 7328390.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 5.530626583099365,
|
|
"epoch": 3.271943573667712,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004237274697837229,
|
|
"loss": 5.1891,
|
|
"mean_token_accuracy": 0.1730980709195137,
|
|
"num_tokens": 7337062.0,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"entropy": 5.411122417449951,
|
|
"epoch": 3.2758620689655173,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004235016306944996,
|
|
"loss": 5.1639,
|
|
"mean_token_accuracy": 0.17733618319034578,
|
|
"num_tokens": 7346364.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 5.473628377914428,
|
|
"epoch": 3.279780564263323,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004232755262087837,
|
|
"loss": 5.1232,
|
|
"mean_token_accuracy": 0.17577927559614182,
|
|
"num_tokens": 7354896.0,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"entropy": 5.492278003692627,
|
|
"epoch": 3.2836990595611284,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00042304915673066083,
|
|
"loss": 5.156,
|
|
"mean_token_accuracy": 0.18079642653465272,
|
|
"num_tokens": 7363591.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 5.46770281791687,
|
|
"epoch": 3.287617554858934,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00042282252266468985,
|
|
"loss": 5.1541,
|
|
"mean_token_accuracy": 0.1765994980931282,
|
|
"num_tokens": 7372891.0,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"entropy": 5.420037746429443,
|
|
"epoch": 3.29153605015674,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004225956244159025,
|
|
"loss": 5.0849,
|
|
"mean_token_accuracy": 0.18195036351680755,
|
|
"num_tokens": 7381945.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 5.56228461265564,
|
|
"epoch": 3.2954545454545454,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004223684623898029,
|
|
"loss": 5.2202,
|
|
"mean_token_accuracy": 0.17319831550121306,
|
|
"num_tokens": 7391449.0,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"entropy": 5.467171859741211,
|
|
"epoch": 3.299373040752351,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004221410369923662,
|
|
"loss": 5.0742,
|
|
"mean_token_accuracy": 0.18290328085422516,
|
|
"num_tokens": 7399762.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 5.36794114112854,
|
|
"epoch": 3.303291536050157,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00042191334863003873,
|
|
"loss": 4.9833,
|
|
"mean_token_accuracy": 0.18501525074243547,
|
|
"num_tokens": 7408711.0,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"entropy": 5.31069974899292,
|
|
"epoch": 3.3072100313479624,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004216853977097363,
|
|
"loss": 5.0883,
|
|
"mean_token_accuracy": 0.1858847111463547,
|
|
"num_tokens": 7416868.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 5.442585515975952,
|
|
"epoch": 3.311128526645768,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004214571846388442,
|
|
"loss": 5.194,
|
|
"mean_token_accuracy": 0.1712857499718666,
|
|
"num_tokens": 7425687.0,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"entropy": 5.51009168624878,
|
|
"epoch": 3.3150470219435735,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004212287098252164,
|
|
"loss": 5.1079,
|
|
"mean_token_accuracy": 0.1798792377114296,
|
|
"num_tokens": 7433714.0,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"entropy": 5.418067598342896,
|
|
"epoch": 3.3189655172413794,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004209999736771742,
|
|
"loss": 5.1424,
|
|
"mean_token_accuracy": 0.18602844029664994,
|
|
"num_tokens": 7442765.0,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"entropy": 5.437410736083985,
|
|
"epoch": 3.322884012539185,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004207709766035063,
|
|
"loss": 5.1322,
|
|
"mean_token_accuracy": 0.1810468316078186,
|
|
"num_tokens": 7450672.0,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"entropy": 5.498105001449585,
|
|
"epoch": 3.3268025078369905,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004205417190134674,
|
|
"loss": 5.191,
|
|
"mean_token_accuracy": 0.17484226375818251,
|
|
"num_tokens": 7458758.0,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"entropy": 5.497838830947876,
|
|
"epoch": 3.330721003134796,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004203122013167783,
|
|
"loss": 5.1422,
|
|
"mean_token_accuracy": 0.173189277946949,
|
|
"num_tokens": 7467527.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"entropy": 5.4545204639434814,
|
|
"epoch": 3.334639498432602,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00042008242392362413,
|
|
"loss": 5.1395,
|
|
"mean_token_accuracy": 0.18133477866649628,
|
|
"num_tokens": 7476659.0,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"entropy": 5.405594730377198,
|
|
"epoch": 3.3385579937304075,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00041985238724465433,
|
|
"loss": 5.0524,
|
|
"mean_token_accuracy": 0.18593618124723435,
|
|
"num_tokens": 7485813.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 5.499361944198609,
|
|
"epoch": 3.342476489028213,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00041962209169098193,
|
|
"loss": 5.1782,
|
|
"mean_token_accuracy": 0.17913033664226533,
|
|
"num_tokens": 7494842.0,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"entropy": 5.469057226181031,
|
|
"epoch": 3.346394984326019,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004193915376741823,
|
|
"loss": 5.193,
|
|
"mean_token_accuracy": 0.1746675878763199,
|
|
"num_tokens": 7503633.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 5.455647945404053,
|
|
"epoch": 3.3503134796238245,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004191607256062928,
|
|
"loss": 5.1075,
|
|
"mean_token_accuracy": 0.180385085940361,
|
|
"num_tokens": 7512759.0,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"entropy": 5.456186103820801,
|
|
"epoch": 3.35423197492163,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004189296558998121,
|
|
"loss": 5.2025,
|
|
"mean_token_accuracy": 0.17833039313554763,
|
|
"num_tokens": 7522284.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 5.457691764831543,
|
|
"epoch": 3.3581504702194356,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004186983289676992,
|
|
"loss": 5.116,
|
|
"mean_token_accuracy": 0.18076436668634416,
|
|
"num_tokens": 7531286.0,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"entropy": 5.451044416427612,
|
|
"epoch": 3.3620689655172415,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00041846674522337296,
|
|
"loss": 5.0976,
|
|
"mean_token_accuracy": 0.17392572164535522,
|
|
"num_tokens": 7540450.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 5.3857035636901855,
|
|
"epoch": 3.365987460815047,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00041823490508071076,
|
|
"loss": 5.0542,
|
|
"mean_token_accuracy": 0.18191471099853515,
|
|
"num_tokens": 7549386.0,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"entropy": 5.5002960681915285,
|
|
"epoch": 3.3699059561128526,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000418002808954049,
|
|
"loss": 5.1349,
|
|
"mean_token_accuracy": 0.17532447278499602,
|
|
"num_tokens": 7558236.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 5.335977363586426,
|
|
"epoch": 3.373824451410658,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00041777045725818057,
|
|
"loss": 5.0178,
|
|
"mean_token_accuracy": 0.18048462569713591,
|
|
"num_tokens": 7566984.0,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"entropy": 5.358128404617309,
|
|
"epoch": 3.377742946708464,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000417537850408356,
|
|
"loss": 5.1446,
|
|
"mean_token_accuracy": 0.1744469001889229,
|
|
"num_tokens": 7576018.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 5.489906167984008,
|
|
"epoch": 3.3816614420062696,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004173049888202814,
|
|
"loss": 5.15,
|
|
"mean_token_accuracy": 0.18125383108854293,
|
|
"num_tokens": 7585294.0,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"entropy": 5.441188955307007,
|
|
"epoch": 3.385579937304075,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004170718729101179,
|
|
"loss": 5.1048,
|
|
"mean_token_accuracy": 0.18156641870737075,
|
|
"num_tokens": 7594634.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 5.387170600891113,
|
|
"epoch": 3.389498432601881,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00041683850309448187,
|
|
"loss": 5.1674,
|
|
"mean_token_accuracy": 0.1728657752275467,
|
|
"num_tokens": 7603153.0,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"entropy": 5.456656408309937,
|
|
"epoch": 3.3934169278996866,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00041660487979044264,
|
|
"loss": 5.1014,
|
|
"mean_token_accuracy": 0.1805425301194191,
|
|
"num_tokens": 7612027.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 5.453139019012451,
|
|
"epoch": 3.397335423197492,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004163710034155231,
|
|
"loss": 5.096,
|
|
"mean_token_accuracy": 0.17648906409740447,
|
|
"num_tokens": 7620725.0,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"entropy": 5.363250875473023,
|
|
"epoch": 3.4012539184952977,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004161368743876982,
|
|
"loss": 5.0648,
|
|
"mean_token_accuracy": 0.18540093004703523,
|
|
"num_tokens": 7628847.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 5.432746839523316,
|
|
"epoch": 3.405172413793103,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004159024931253945,
|
|
"loss": 5.1639,
|
|
"mean_token_accuracy": 0.1767767533659935,
|
|
"num_tokens": 7637668.0,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"entropy": 5.462332487106323,
|
|
"epoch": 3.409090909090909,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00041566786004748943,
|
|
"loss": 5.2048,
|
|
"mean_token_accuracy": 0.1763555735349655,
|
|
"num_tokens": 7646545.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 5.489386320114136,
|
|
"epoch": 3.4130094043887147,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00041543297557331015,
|
|
"loss": 5.1754,
|
|
"mean_token_accuracy": 0.1723347634077072,
|
|
"num_tokens": 7654967.0,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"entropy": 5.40674934387207,
|
|
"epoch": 3.41692789968652,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004151978401226335,
|
|
"loss": 5.124,
|
|
"mean_token_accuracy": 0.1786526545882225,
|
|
"num_tokens": 7663718.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 5.47047872543335,
|
|
"epoch": 3.420846394984326,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00041496245411568435,
|
|
"loss": 5.1677,
|
|
"mean_token_accuracy": 0.17421529591083526,
|
|
"num_tokens": 7672430.0,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"entropy": 5.442583322525024,
|
|
"epoch": 3.4247648902821317,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004147268179731359,
|
|
"loss": 5.1351,
|
|
"mean_token_accuracy": 0.1748214393854141,
|
|
"num_tokens": 7681270.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 5.454329872131348,
|
|
"epoch": 3.4286833855799372,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00041449093211610815,
|
|
"loss": 5.1524,
|
|
"mean_token_accuracy": 0.17915753722190858,
|
|
"num_tokens": 7690124.0,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"entropy": 5.512417268753052,
|
|
"epoch": 3.4326018808777428,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00041425479696616734,
|
|
"loss": 5.1595,
|
|
"mean_token_accuracy": 0.1722505882382393,
|
|
"num_tokens": 7699511.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 5.426167726516724,
|
|
"epoch": 3.4365203761755487,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004140184129453253,
|
|
"loss": 5.116,
|
|
"mean_token_accuracy": 0.18224495351314546,
|
|
"num_tokens": 7708796.0,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"entropy": 5.44820647239685,
|
|
"epoch": 3.4404388714733543,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00041378178047603845,
|
|
"loss": 5.1091,
|
|
"mean_token_accuracy": 0.18672927916049958,
|
|
"num_tokens": 7717446.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 5.462598896026611,
|
|
"epoch": 3.44435736677116,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004135448999812074,
|
|
"loss": 5.1568,
|
|
"mean_token_accuracy": 0.17604882270097733,
|
|
"num_tokens": 7726814.0,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"entropy": 5.400063228607178,
|
|
"epoch": 3.4482758620689653,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004133077718841763,
|
|
"loss": 5.0632,
|
|
"mean_token_accuracy": 0.18461630046367644,
|
|
"num_tokens": 7735895.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 5.38959813117981,
|
|
"epoch": 3.4521943573667713,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00041307039660873113,
|
|
"loss": 5.1216,
|
|
"mean_token_accuracy": 0.18026788532733917,
|
|
"num_tokens": 7744160.0,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"entropy": 5.483940744400025,
|
|
"epoch": 3.456112852664577,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004128327745791002,
|
|
"loss": 5.2246,
|
|
"mean_token_accuracy": 0.1668292060494423,
|
|
"num_tokens": 7752844.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 5.458703708648682,
|
|
"epoch": 3.4600313479623823,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004125949062199526,
|
|
"loss": 5.1852,
|
|
"mean_token_accuracy": 0.17616474032402038,
|
|
"num_tokens": 7762327.0,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"entropy": 5.402634143829346,
|
|
"epoch": 3.4639498432601883,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00041235679195639766,
|
|
"loss": 5.0954,
|
|
"mean_token_accuracy": 0.1764765590429306,
|
|
"num_tokens": 7770813.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 5.373932027816773,
|
|
"epoch": 3.467868338557994,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00041211843221398406,
|
|
"loss": 5.1366,
|
|
"mean_token_accuracy": 0.1720389500260353,
|
|
"num_tokens": 7779357.0,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"entropy": 5.491668367385865,
|
|
"epoch": 3.4717868338557993,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004118798274186994,
|
|
"loss": 5.1477,
|
|
"mean_token_accuracy": 0.17953715473413467,
|
|
"num_tokens": 7787302.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 5.41265459060669,
|
|
"epoch": 3.475705329153605,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004116409779969691,
|
|
"loss": 5.1765,
|
|
"mean_token_accuracy": 0.17347506135702134,
|
|
"num_tokens": 7795651.0,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"entropy": 5.396276617050171,
|
|
"epoch": 3.479623824451411,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00041140188437565586,
|
|
"loss": 5.1345,
|
|
"mean_token_accuracy": 0.18256539553403855,
|
|
"num_tokens": 7804644.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 5.536772632598877,
|
|
"epoch": 3.4835423197492164,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00041116254698205873,
|
|
"loss": 5.1149,
|
|
"mean_token_accuracy": 0.1830981507897377,
|
|
"num_tokens": 7812479.0,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"entropy": 5.399745845794678,
|
|
"epoch": 3.487460815047022,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00041092296624391244,
|
|
"loss": 5.1163,
|
|
"mean_token_accuracy": 0.179618901014328,
|
|
"num_tokens": 7820678.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 5.3498913764953615,
|
|
"epoch": 3.4913793103448274,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004106831425893865,
|
|
"loss": 5.1459,
|
|
"mean_token_accuracy": 0.18171471655368804,
|
|
"num_tokens": 7829063.0,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"entropy": 5.520697450637817,
|
|
"epoch": 3.4952978056426334,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004104430764470849,
|
|
"loss": 5.18,
|
|
"mean_token_accuracy": 0.17809778749942778,
|
|
"num_tokens": 7837274.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 5.494675731658935,
|
|
"epoch": 3.499216300940439,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004102027682460445,
|
|
"loss": 5.0874,
|
|
"mean_token_accuracy": 0.1875218093395233,
|
|
"num_tokens": 7845379.0,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"entropy": 5.400750637054443,
|
|
"epoch": 3.5031347962382444,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004099622184157353,
|
|
"loss": 5.1269,
|
|
"mean_token_accuracy": 0.1800209864974022,
|
|
"num_tokens": 7853760.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 5.526426029205322,
|
|
"epoch": 3.5070532915360504,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004097214273860586,
|
|
"loss": 5.183,
|
|
"mean_token_accuracy": 0.17379164099693298,
|
|
"num_tokens": 7862026.0,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"entropy": 5.451271629333496,
|
|
"epoch": 3.510971786833856,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004094803955873471,
|
|
"loss": 5.1363,
|
|
"mean_token_accuracy": 0.1838302046060562,
|
|
"num_tokens": 7870369.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 5.4413388729095455,
|
|
"epoch": 3.5148902821316614,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004092391234503638,
|
|
"loss": 5.1357,
|
|
"mean_token_accuracy": 0.18130728155374526,
|
|
"num_tokens": 7878524.0,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"entropy": 5.452888298034668,
|
|
"epoch": 3.518808777429467,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00040899761140630094,
|
|
"loss": 5.1942,
|
|
"mean_token_accuracy": 0.16720112562179565,
|
|
"num_tokens": 7887369.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 5.4835591316223145,
|
|
"epoch": 3.5227272727272725,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00040875585988677985,
|
|
"loss": 5.1658,
|
|
"mean_token_accuracy": 0.1724832221865654,
|
|
"num_tokens": 7896112.0,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"entropy": 5.361825084686279,
|
|
"epoch": 3.5266457680250785,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004085138693238497,
|
|
"loss": 5.0118,
|
|
"mean_token_accuracy": 0.18901925683021545,
|
|
"num_tokens": 7904580.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 3.5266457680250785,
|
|
"eval_entropy": 5.337222361749457,
|
|
"eval_loss": 5.80082368850708,
|
|
"eval_mean_token_accuracy": 0.1584607647197653,
|
|
"eval_num_tokens": 7904580.0,
|
|
"eval_runtime": 3.0275,
|
|
"eval_samples_per_second": 1361.513,
|
|
"eval_steps_per_second": 170.437,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 5.480594873428345,
|
|
"epoch": 3.530564263322884,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004082716401499867,
|
|
"loss": 5.1681,
|
|
"mean_token_accuracy": 0.17277304977178573,
|
|
"num_tokens": 7913755.0,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"entropy": 5.470868158340454,
|
|
"epoch": 3.5344827586206895,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00040802917279809383,
|
|
"loss": 5.1477,
|
|
"mean_token_accuracy": 0.17918196320533752,
|
|
"num_tokens": 7922385.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 5.417375326156616,
|
|
"epoch": 3.5384012539184955,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00040778646770149953,
|
|
"loss": 5.1731,
|
|
"mean_token_accuracy": 0.16962890774011613,
|
|
"num_tokens": 7931634.0,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"entropy": 5.406470584869385,
|
|
"epoch": 3.542319749216301,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00040754352529395716,
|
|
"loss": 5.0869,
|
|
"mean_token_accuracy": 0.17542838752269746,
|
|
"num_tokens": 7939866.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 5.434367752075195,
|
|
"epoch": 3.5462382445141065,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00040730034600964415,
|
|
"loss": 5.2075,
|
|
"mean_token_accuracy": 0.17026106566190718,
|
|
"num_tokens": 7948961.0,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"entropy": 5.289756202697754,
|
|
"epoch": 3.5501567398119125,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004070569302831613,
|
|
"loss": 5.0367,
|
|
"mean_token_accuracy": 0.17900677770376205,
|
|
"num_tokens": 7958526.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 5.443406343460083,
|
|
"epoch": 3.554075235109718,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000406813278549532,
|
|
"loss": 5.1621,
|
|
"mean_token_accuracy": 0.17983901798725127,
|
|
"num_tokens": 7966887.0,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"entropy": 5.476217126846313,
|
|
"epoch": 3.5579937304075235,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00040656939124420144,
|
|
"loss": 5.1251,
|
|
"mean_token_accuracy": 0.17843578457832338,
|
|
"num_tokens": 7974929.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 5.374083375930786,
|
|
"epoch": 3.561912225705329,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004063252688030358,
|
|
"loss": 5.0863,
|
|
"mean_token_accuracy": 0.18091104477643966,
|
|
"num_tokens": 7983821.0,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"entropy": 5.38077392578125,
|
|
"epoch": 3.5658307210031346,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004060809116623213,
|
|
"loss": 5.1152,
|
|
"mean_token_accuracy": 0.17550334483385086,
|
|
"num_tokens": 7992109.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 5.365566873550415,
|
|
"epoch": 3.5697492163009406,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000405836320258764,
|
|
"loss": 5.1301,
|
|
"mean_token_accuracy": 0.1887578547000885,
|
|
"num_tokens": 8001098.0,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"entropy": 5.484101867675781,
|
|
"epoch": 3.573667711598746,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004055914950294882,
|
|
"loss": 5.1373,
|
|
"mean_token_accuracy": 0.17732277810573577,
|
|
"num_tokens": 8009559.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 5.372460269927979,
|
|
"epoch": 3.5775862068965516,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00040534643641203645,
|
|
"loss": 5.059,
|
|
"mean_token_accuracy": 0.18330815881490709,
|
|
"num_tokens": 8018021.0,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"entropy": 5.381331825256348,
|
|
"epoch": 3.5815047021943576,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004051011448443681,
|
|
"loss": 5.0922,
|
|
"mean_token_accuracy": 0.17764217853546144,
|
|
"num_tokens": 8026258.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 5.434953689575195,
|
|
"epoch": 3.585423197492163,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000404855620764859,
|
|
"loss": 5.1622,
|
|
"mean_token_accuracy": 0.18100216686725618,
|
|
"num_tokens": 8036761.0,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"entropy": 5.496177244186401,
|
|
"epoch": 3.5893416927899686,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004046098646123006,
|
|
"loss": 5.1788,
|
|
"mean_token_accuracy": 0.1807200565934181,
|
|
"num_tokens": 8045115.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 5.508572769165039,
|
|
"epoch": 3.593260188087774,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00040436387682589876,
|
|
"loss": 5.1826,
|
|
"mean_token_accuracy": 0.1770282730460167,
|
|
"num_tokens": 8054334.0,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"entropy": 5.375803804397583,
|
|
"epoch": 3.5971786833855797,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004041176578452737,
|
|
"loss": 5.1423,
|
|
"mean_token_accuracy": 0.1803617998957634,
|
|
"num_tokens": 8063165.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 5.411370325088501,
|
|
"epoch": 3.6010971786833856,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004038712081104587,
|
|
"loss": 5.1227,
|
|
"mean_token_accuracy": 0.17998642325401307,
|
|
"num_tokens": 8072498.0,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"entropy": 5.437928962707519,
|
|
"epoch": 3.605015673981191,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00040362452806189927,
|
|
"loss": 5.1442,
|
|
"mean_token_accuracy": 0.17676037400960923,
|
|
"num_tokens": 8081455.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 5.356879949569702,
|
|
"epoch": 3.6089341692789967,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004033776181404527,
|
|
"loss": 5.1483,
|
|
"mean_token_accuracy": 0.17423220127820968,
|
|
"num_tokens": 8090992.0,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"entropy": 5.477758693695068,
|
|
"epoch": 3.6128526645768027,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00040313047878738704,
|
|
"loss": 5.1429,
|
|
"mean_token_accuracy": 0.17423719316720962,
|
|
"num_tokens": 8099976.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 5.43692717552185,
|
|
"epoch": 3.616771159874608,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004028831104443805,
|
|
"loss": 5.0531,
|
|
"mean_token_accuracy": 0.19072302281856537,
|
|
"num_tokens": 8108275.0,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"entropy": 5.37940616607666,
|
|
"epoch": 3.6206896551724137,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004026355135535202,
|
|
"loss": 5.1237,
|
|
"mean_token_accuracy": 0.18099311292171477,
|
|
"num_tokens": 8117441.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 5.383903551101684,
|
|
"epoch": 3.6246081504702197,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00040238768855730214,
|
|
"loss": 5.1077,
|
|
"mean_token_accuracy": 0.17485350966453553,
|
|
"num_tokens": 8125709.0,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"entropy": 5.4046632766723635,
|
|
"epoch": 3.628526645768025,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00040213963589862963,
|
|
"loss": 5.1292,
|
|
"mean_token_accuracy": 0.1794295147061348,
|
|
"num_tokens": 8134418.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 5.482091903686523,
|
|
"epoch": 3.6324451410658307,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004018913560208131,
|
|
"loss": 5.1953,
|
|
"mean_token_accuracy": 0.17151551097631454,
|
|
"num_tokens": 8143729.0,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"entropy": 5.426027917861939,
|
|
"epoch": 3.6363636363636362,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004016428493675689,
|
|
"loss": 5.0717,
|
|
"mean_token_accuracy": 0.1761728420853615,
|
|
"num_tokens": 8152613.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 5.364865112304687,
|
|
"epoch": 3.6402821316614418,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004013941163830187,
|
|
"loss": 5.0649,
|
|
"mean_token_accuracy": 0.18139948844909667,
|
|
"num_tokens": 8160862.0,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"entropy": 5.3869531631469725,
|
|
"epoch": 3.6442006269592477,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004011451575116887,
|
|
"loss": 5.1554,
|
|
"mean_token_accuracy": 0.1723189875483513,
|
|
"num_tokens": 8169369.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 5.4002196311950685,
|
|
"epoch": 3.6481191222570533,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004008959731985087,
|
|
"loss": 5.0635,
|
|
"mean_token_accuracy": 0.18389806002378464,
|
|
"num_tokens": 8178152.0,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"entropy": 5.342556095123291,
|
|
"epoch": 3.652037617554859,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00040064656388881157,
|
|
"loss": 5.0368,
|
|
"mean_token_accuracy": 0.18386815786361693,
|
|
"num_tokens": 8186690.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 5.345205497741699,
|
|
"epoch": 3.6559561128526648,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004003969300283321,
|
|
"loss": 5.0211,
|
|
"mean_token_accuracy": 0.1854146108031273,
|
|
"num_tokens": 8194976.0,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"entropy": 5.428677892684936,
|
|
"epoch": 3.6598746081504703,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00040014707206320653,
|
|
"loss": 5.1304,
|
|
"mean_token_accuracy": 0.1809231385588646,
|
|
"num_tokens": 8202964.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 5.375498723983765,
|
|
"epoch": 3.663793103448276,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00039989699043997153,
|
|
"loss": 5.1371,
|
|
"mean_token_accuracy": 0.1780979588627815,
|
|
"num_tokens": 8213084.0,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"entropy": 5.407328462600708,
|
|
"epoch": 3.6677115987460818,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00039964668560556356,
|
|
"loss": 5.0542,
|
|
"mean_token_accuracy": 0.18738225400447844,
|
|
"num_tokens": 8221645.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 5.374656867980957,
|
|
"epoch": 3.6716300940438873,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00039939615800731784,
|
|
"loss": 5.0422,
|
|
"mean_token_accuracy": 0.18308537155389787,
|
|
"num_tokens": 8230195.0,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"entropy": 5.379098272323608,
|
|
"epoch": 3.675548589341693,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00039914540809296795,
|
|
"loss": 5.1367,
|
|
"mean_token_accuracy": 0.17565943598747252,
|
|
"num_tokens": 8239608.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 5.494941234588623,
|
|
"epoch": 3.6794670846394983,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003988944363106445,
|
|
"loss": 5.1561,
|
|
"mean_token_accuracy": 0.18333661705255508,
|
|
"num_tokens": 8247993.0,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"entropy": 5.368853759765625,
|
|
"epoch": 3.683385579937304,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003986432431088749,
|
|
"loss": 5.143,
|
|
"mean_token_accuracy": 0.1748058944940567,
|
|
"num_tokens": 8257279.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 5.482876777648926,
|
|
"epoch": 3.68730407523511,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000398391828936582,
|
|
"loss": 5.1776,
|
|
"mean_token_accuracy": 0.1722080945968628,
|
|
"num_tokens": 8265166.0,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"entropy": 5.340915870666504,
|
|
"epoch": 3.6912225705329154,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0003981401942430838,
|
|
"loss": 4.9655,
|
|
"mean_token_accuracy": 0.18662159740924836,
|
|
"num_tokens": 8273281.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 5.375476741790772,
|
|
"epoch": 3.695141065830721,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00039788833947809217,
|
|
"loss": 5.1727,
|
|
"mean_token_accuracy": 0.17165548503398895,
|
|
"num_tokens": 8282432.0,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"entropy": 5.347471475601196,
|
|
"epoch": 3.699059561128527,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0003976362650917125,
|
|
"loss": 4.9954,
|
|
"mean_token_accuracy": 0.19145373702049256,
|
|
"num_tokens": 8290909.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 5.368044900894165,
|
|
"epoch": 3.7029780564263324,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00039738397153444264,
|
|
"loss": 5.1106,
|
|
"mean_token_accuracy": 0.18162070959806442,
|
|
"num_tokens": 8299581.0,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"entropy": 5.46605544090271,
|
|
"epoch": 3.706896551724138,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003971314592571719,
|
|
"loss": 5.1308,
|
|
"mean_token_accuracy": 0.1756805568933487,
|
|
"num_tokens": 8308184.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 5.377258920669556,
|
|
"epoch": 3.7108150470219434,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0003968787287111809,
|
|
"loss": 5.1026,
|
|
"mean_token_accuracy": 0.18265192806720734,
|
|
"num_tokens": 8317054.0,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"entropy": 5.343409299850464,
|
|
"epoch": 3.714733542319749,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00039662578034814,
|
|
"loss": 5.1292,
|
|
"mean_token_accuracy": 0.18576941788196563,
|
|
"num_tokens": 8325961.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 5.552810716629028,
|
|
"epoch": 3.718652037617555,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00039637261462010886,
|
|
"loss": 5.2267,
|
|
"mean_token_accuracy": 0.17132796049118043,
|
|
"num_tokens": 8334401.0,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"entropy": 5.4945728302001955,
|
|
"epoch": 3.7225705329153604,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003961192319795358,
|
|
"loss": 5.1954,
|
|
"mean_token_accuracy": 0.1732421785593033,
|
|
"num_tokens": 8343786.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 5.4864644527435305,
|
|
"epoch": 3.726489028213166,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0003958656328792565,
|
|
"loss": 5.17,
|
|
"mean_token_accuracy": 0.1812288358807564,
|
|
"num_tokens": 8352286.0,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"entropy": 5.339477300643921,
|
|
"epoch": 3.730407523510972,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00039561181777249396,
|
|
"loss": 5.0962,
|
|
"mean_token_accuracy": 0.18505542427301408,
|
|
"num_tokens": 8361274.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 5.387203741073608,
|
|
"epoch": 3.7343260188087775,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00039535778711285676,
|
|
"loss": 5.1225,
|
|
"mean_token_accuracy": 0.18194636851549148,
|
|
"num_tokens": 8369881.0,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"entropy": 5.474074840545654,
|
|
"epoch": 3.738244514106583,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003951035413543388,
|
|
"loss": 5.1981,
|
|
"mean_token_accuracy": 0.16484691351652145,
|
|
"num_tokens": 8378234.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 5.408027410507202,
|
|
"epoch": 3.742163009404389,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00039484908095131874,
|
|
"loss": 5.1144,
|
|
"mean_token_accuracy": 0.18274004459381105,
|
|
"num_tokens": 8387023.0,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"entropy": 5.36602144241333,
|
|
"epoch": 3.7460815047021945,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003945944063585582,
|
|
"loss": 5.0755,
|
|
"mean_token_accuracy": 0.1848648577928543,
|
|
"num_tokens": 8396110.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 5.341309881210327,
|
|
"epoch": 3.75,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00039433951803120225,
|
|
"loss": 5.0402,
|
|
"mean_token_accuracy": 0.1882338985800743,
|
|
"num_tokens": 8405219.0,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"entropy": 5.499637079238892,
|
|
"epoch": 3.7539184952978055,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00039408441642477764,
|
|
"loss": 5.2427,
|
|
"mean_token_accuracy": 0.1696087598800659,
|
|
"num_tokens": 8413996.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 5.383800268173218,
|
|
"epoch": 3.757836990595611,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0003938291019951922,
|
|
"loss": 4.9861,
|
|
"mean_token_accuracy": 0.19548303335905076,
|
|
"num_tokens": 8421931.0,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"entropy": 5.373687696456909,
|
|
"epoch": 3.761755485893417,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003935735751987344,
|
|
"loss": 5.0839,
|
|
"mean_token_accuracy": 0.1746331587433815,
|
|
"num_tokens": 8430090.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 5.327823877334595,
|
|
"epoch": 3.7656739811912225,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00039331783649207175,
|
|
"loss": 5.0766,
|
|
"mean_token_accuracy": 0.17887697219848633,
|
|
"num_tokens": 8439311.0,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"entropy": 5.389744234085083,
|
|
"epoch": 3.769592476489028,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00039306188633225097,
|
|
"loss": 5.1405,
|
|
"mean_token_accuracy": 0.1806069329380989,
|
|
"num_tokens": 8448952.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 5.394601345062256,
|
|
"epoch": 3.773510971786834,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003928057251766965,
|
|
"loss": 5.0759,
|
|
"mean_token_accuracy": 0.1838828906416893,
|
|
"num_tokens": 8457177.0,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"entropy": 5.355936098098755,
|
|
"epoch": 3.7774294670846396,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00039254935348320984,
|
|
"loss": 5.1241,
|
|
"mean_token_accuracy": 0.18029553443193436,
|
|
"num_tokens": 8465844.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 5.447889137268066,
|
|
"epoch": 3.781347962382445,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00039229277170996885,
|
|
"loss": 5.2206,
|
|
"mean_token_accuracy": 0.17226959466934205,
|
|
"num_tokens": 8475198.0,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"entropy": 5.474216604232788,
|
|
"epoch": 3.785266457680251,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003920359803155266,
|
|
"loss": 5.0949,
|
|
"mean_token_accuracy": 0.182598714530468,
|
|
"num_tokens": 8484050.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 5.325715017318726,
|
|
"epoch": 3.7891849529780566,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00039177897975881115,
|
|
"loss": 5.0895,
|
|
"mean_token_accuracy": 0.1856340140104294,
|
|
"num_tokens": 8491825.0,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"entropy": 5.339238119125366,
|
|
"epoch": 3.793103448275862,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003915217704991239,
|
|
"loss": 5.0818,
|
|
"mean_token_accuracy": 0.18506858348846436,
|
|
"num_tokens": 8500651.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 5.439371633529663,
|
|
"epoch": 3.7970219435736676,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003912643529961397,
|
|
"loss": 5.0825,
|
|
"mean_token_accuracy": 0.1822042629122734,
|
|
"num_tokens": 8508464.0,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"entropy": 5.415101051330566,
|
|
"epoch": 3.800940438871473,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003910067277099053,
|
|
"loss": 5.1316,
|
|
"mean_token_accuracy": 0.18691204339265824,
|
|
"num_tokens": 8517676.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 5.458886194229126,
|
|
"epoch": 3.804858934169279,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00039074889510083894,
|
|
"loss": 5.1881,
|
|
"mean_token_accuracy": 0.1722204566001892,
|
|
"num_tokens": 8526926.0,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"entropy": 5.37487382888794,
|
|
"epoch": 3.8087774294670846,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003904908556297293,
|
|
"loss": 5.0596,
|
|
"mean_token_accuracy": 0.1804742068052292,
|
|
"num_tokens": 8535814.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 5.458344507217407,
|
|
"epoch": 3.81269592476489,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003902326097577345,
|
|
"loss": 5.1719,
|
|
"mean_token_accuracy": 0.17721525430679322,
|
|
"num_tokens": 8544814.0,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"entropy": 5.417445516586303,
|
|
"epoch": 3.816614420062696,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00038997415794638206,
|
|
"loss": 5.1667,
|
|
"mean_token_accuracy": 0.17812894880771638,
|
|
"num_tokens": 8553455.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 5.3424866676330565,
|
|
"epoch": 3.8205329153605017,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0003897155006575672,
|
|
"loss": 5.0781,
|
|
"mean_token_accuracy": 0.1781342074275017,
|
|
"num_tokens": 8562048.0,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"entropy": 5.407690954208374,
|
|
"epoch": 3.824451410658307,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00038945663835355247,
|
|
"loss": 5.1219,
|
|
"mean_token_accuracy": 0.1807398185133934,
|
|
"num_tokens": 8571855.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 5.394909906387329,
|
|
"epoch": 3.8283699059561127,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00038919757149696665,
|
|
"loss": 5.1129,
|
|
"mean_token_accuracy": 0.17730758637189864,
|
|
"num_tokens": 8580425.0,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"entropy": 5.411207771301269,
|
|
"epoch": 3.8322884012539182,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00038893830055080437,
|
|
"loss": 5.1428,
|
|
"mean_token_accuracy": 0.18344798386096955,
|
|
"num_tokens": 8589751.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 5.435813570022583,
|
|
"epoch": 3.836206896551724,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0003886788259784248,
|
|
"loss": 5.1656,
|
|
"mean_token_accuracy": 0.17248253524303436,
|
|
"num_tokens": 8598077.0,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"entropy": 5.494762659072876,
|
|
"epoch": 3.8401253918495297,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00038841914824355093,
|
|
"loss": 5.1758,
|
|
"mean_token_accuracy": 0.17547922879457473,
|
|
"num_tokens": 8606691.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 5.3925032138824465,
|
|
"epoch": 3.8440438871473352,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00038815926781026914,
|
|
"loss": 5.1202,
|
|
"mean_token_accuracy": 0.17767349481582642,
|
|
"num_tokens": 8615477.0,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"entropy": 5.366964483261109,
|
|
"epoch": 3.847962382445141,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0003878991851430279,
|
|
"loss": 5.1388,
|
|
"mean_token_accuracy": 0.18286672681570054,
|
|
"num_tokens": 8623982.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 5.405296802520752,
|
|
"epoch": 3.8518808777429467,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003876389007066371,
|
|
"loss": 5.1606,
|
|
"mean_token_accuracy": 0.17795273512601853,
|
|
"num_tokens": 8633017.0,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"entropy": 5.375383281707764,
|
|
"epoch": 3.8557993730407523,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0003873784149662672,
|
|
"loss": 5.1049,
|
|
"mean_token_accuracy": 0.18141486793756484,
|
|
"num_tokens": 8642299.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 5.348470163345337,
|
|
"epoch": 3.8597178683385582,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0003871177283874484,
|
|
"loss": 5.0299,
|
|
"mean_token_accuracy": 0.18119459301233293,
|
|
"num_tokens": 8651008.0,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"entropy": 5.368545627593994,
|
|
"epoch": 3.8636363636363638,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00038685684143606995,
|
|
"loss": 5.123,
|
|
"mean_token_accuracy": 0.17978052347898482,
|
|
"num_tokens": 8660103.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 5.316098833084107,
|
|
"epoch": 3.8675548589341693,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003865957545783791,
|
|
"loss": 5.076,
|
|
"mean_token_accuracy": 0.1843046337366104,
|
|
"num_tokens": 8668646.0,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"entropy": 5.400100469589233,
|
|
"epoch": 3.871473354231975,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00038633446828098046,
|
|
"loss": 5.2193,
|
|
"mean_token_accuracy": 0.17444987893104552,
|
|
"num_tokens": 8678190.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 5.516893720626831,
|
|
"epoch": 3.8753918495297803,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000386072983010835,
|
|
"loss": 5.2352,
|
|
"mean_token_accuracy": 0.1684794768691063,
|
|
"num_tokens": 8687336.0,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"entropy": 5.461493587493896,
|
|
"epoch": 3.8793103448275863,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00038581129923525914,
|
|
"loss": 5.1245,
|
|
"mean_token_accuracy": 0.18254335820674897,
|
|
"num_tokens": 8695939.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 5.378901958465576,
|
|
"epoch": 3.883228840125392,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00038554941742192445,
|
|
"loss": 5.166,
|
|
"mean_token_accuracy": 0.18020853847265245,
|
|
"num_tokens": 8704828.0,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"entropy": 5.441786289215088,
|
|
"epoch": 3.8871473354231973,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003852873380388561,
|
|
"loss": 5.0914,
|
|
"mean_token_accuracy": 0.19002858847379683,
|
|
"num_tokens": 8714427.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 5.476487874984741,
|
|
"epoch": 3.8910658307210033,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0003850250615544323,
|
|
"loss": 5.1655,
|
|
"mean_token_accuracy": 0.18157008439302444,
|
|
"num_tokens": 8723737.0,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"entropy": 5.34540548324585,
|
|
"epoch": 3.894984326018809,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00038476258843738386,
|
|
"loss": 5.1234,
|
|
"mean_token_accuracy": 0.18455416560173035,
|
|
"num_tokens": 8733149.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 5.338756895065307,
|
|
"epoch": 3.8989028213166144,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00038449991915679273,
|
|
"loss": 5.0862,
|
|
"mean_token_accuracy": 0.18745338916778564,
|
|
"num_tokens": 8742000.0,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"entropy": 5.400898027420044,
|
|
"epoch": 3.9028213166144203,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003842370541820915,
|
|
"loss": 5.1547,
|
|
"mean_token_accuracy": 0.1786161720752716,
|
|
"num_tokens": 8751069.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 5.414169979095459,
|
|
"epoch": 3.906739811912226,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00038397399398306243,
|
|
"loss": 5.0647,
|
|
"mean_token_accuracy": 0.18002667427062988,
|
|
"num_tokens": 8760092.0,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"entropy": 5.392027568817139,
|
|
"epoch": 3.9106583072100314,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00038371073902983684,
|
|
"loss": 5.0915,
|
|
"mean_token_accuracy": 0.18434868305921553,
|
|
"num_tokens": 8768039.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 5.399258708953857,
|
|
"epoch": 3.914576802507837,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000383447289792894,
|
|
"loss": 5.1307,
|
|
"mean_token_accuracy": 0.17574300169944762,
|
|
"num_tokens": 8776630.0,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"entropy": 5.417212390899659,
|
|
"epoch": 3.9184952978056424,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00038318364674306036,
|
|
"loss": 5.081,
|
|
"mean_token_accuracy": 0.18248913884162904,
|
|
"num_tokens": 8785272.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 3.9184952978056424,
|
|
"eval_entropy": 5.3187168975209085,
|
|
"eval_loss": 5.745798110961914,
|
|
"eval_mean_token_accuracy": 0.16200539206927136,
|
|
"eval_num_tokens": 8785272.0,
|
|
"eval_runtime": 2.843,
|
|
"eval_samples_per_second": 1449.89,
|
|
"eval_steps_per_second": 181.5,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 5.414234256744384,
|
|
"epoch": 3.9224137931034484,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00038291981035150883,
|
|
"loss": 5.1083,
|
|
"mean_token_accuracy": 0.18136782944202423,
|
|
"num_tokens": 8794197.0,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"entropy": 5.408847141265869,
|
|
"epoch": 3.926332288401254,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0003826557810897579,
|
|
"loss": 5.1369,
|
|
"mean_token_accuracy": 0.17788099348545075,
|
|
"num_tokens": 8803195.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 5.396364307403564,
|
|
"epoch": 3.9302507836990594,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003823915594296705,
|
|
"loss": 5.0938,
|
|
"mean_token_accuracy": 0.18403008282184602,
|
|
"num_tokens": 8811772.0,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"entropy": 5.265464973449707,
|
|
"epoch": 3.9341692789968654,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003821271458434538,
|
|
"loss": 5.0723,
|
|
"mean_token_accuracy": 0.1878345564007759,
|
|
"num_tokens": 8820482.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 5.374425792694092,
|
|
"epoch": 3.938087774294671,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00038186254080365756,
|
|
"loss": 5.1502,
|
|
"mean_token_accuracy": 0.1756037726998329,
|
|
"num_tokens": 8828761.0,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"entropy": 5.351945161819458,
|
|
"epoch": 3.9420062695924765,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000381597744783174,
|
|
"loss": 5.0315,
|
|
"mean_token_accuracy": 0.18276280909776688,
|
|
"num_tokens": 8837165.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 5.379795837402344,
|
|
"epoch": 3.945924764890282,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00038133275825523645,
|
|
"loss": 5.1118,
|
|
"mean_token_accuracy": 0.18035837560892104,
|
|
"num_tokens": 8845472.0,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"entropy": 5.405280685424804,
|
|
"epoch": 3.9498432601880875,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00038106758169341887,
|
|
"loss": 5.102,
|
|
"mean_token_accuracy": 0.18403379768133163,
|
|
"num_tokens": 8855246.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 5.4769899368286135,
|
|
"epoch": 3.9537617554858935,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0003808022155716348,
|
|
"loss": 5.1747,
|
|
"mean_token_accuracy": 0.1827925354242325,
|
|
"num_tokens": 8863667.0,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"entropy": 5.351367425918579,
|
|
"epoch": 3.957680250783699,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0003805366603641364,
|
|
"loss": 5.0128,
|
|
"mean_token_accuracy": 0.19337196052074432,
|
|
"num_tokens": 8872306.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 5.3284947872161865,
|
|
"epoch": 3.9615987460815045,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00038027091654551406,
|
|
"loss": 5.057,
|
|
"mean_token_accuracy": 0.18368325531482696,
|
|
"num_tokens": 8882037.0,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"entropy": 5.311836004257202,
|
|
"epoch": 3.9655172413793105,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00038000498459069487,
|
|
"loss": 5.0921,
|
|
"mean_token_accuracy": 0.18096426427364348,
|
|
"num_tokens": 8890685.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 5.350221347808838,
|
|
"epoch": 3.969435736677116,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003797388649749426,
|
|
"loss": 5.0877,
|
|
"mean_token_accuracy": 0.18418147265911103,
|
|
"num_tokens": 8899927.0,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"entropy": 5.397369432449341,
|
|
"epoch": 3.9733542319749215,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003794725581738559,
|
|
"loss": 5.1223,
|
|
"mean_token_accuracy": 0.18000091165304183,
|
|
"num_tokens": 8908571.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 5.36154465675354,
|
|
"epoch": 3.9772727272727275,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00037920606466336834,
|
|
"loss": 5.0282,
|
|
"mean_token_accuracy": 0.1855129450559616,
|
|
"num_tokens": 8917246.0,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"entropy": 5.3474297523498535,
|
|
"epoch": 3.981191222570533,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000378939384919747,
|
|
"loss": 5.1311,
|
|
"mean_token_accuracy": 0.1813666984438896,
|
|
"num_tokens": 8926116.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 5.423180866241455,
|
|
"epoch": 3.9851097178683386,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0003786725194195918,
|
|
"loss": 5.1708,
|
|
"mean_token_accuracy": 0.1746719792485237,
|
|
"num_tokens": 8934914.0,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"entropy": 5.4457080364227295,
|
|
"epoch": 3.989028213166144,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00037840546863983484,
|
|
"loss": 5.0585,
|
|
"mean_token_accuracy": 0.18123985528945924,
|
|
"num_tokens": 8943321.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 5.346265506744385,
|
|
"epoch": 3.9929467084639496,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00037813823305773883,
|
|
"loss": 5.1302,
|
|
"mean_token_accuracy": 0.1808217704296112,
|
|
"num_tokens": 8951848.0,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"entropy": 5.394402647018433,
|
|
"epoch": 3.9968652037617556,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003778708131508974,
|
|
"loss": 5.11,
|
|
"mean_token_accuracy": 0.18120819330215454,
|
|
"num_tokens": 8961291.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 5.469988012313843,
|
|
"epoch": 4.0007836990595615,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00037760320939723307,
|
|
"loss": 5.1504,
|
|
"mean_token_accuracy": 0.1797046482563019,
|
|
"num_tokens": 8971116.0,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"entropy": 5.356029987335205,
|
|
"epoch": 4.004702194357367,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00037733542227499727,
|
|
"loss": 4.7718,
|
|
"mean_token_accuracy": 0.20312984436750411,
|
|
"num_tokens": 8979362.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 5.430681324005127,
|
|
"epoch": 4.008620689655173,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00037706745226276893,
|
|
"loss": 4.9455,
|
|
"mean_token_accuracy": 0.19273480027914047,
|
|
"num_tokens": 8987728.0,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"entropy": 5.399423599243164,
|
|
"epoch": 4.012539184952978,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0003767992998394539,
|
|
"loss": 4.7894,
|
|
"mean_token_accuracy": 0.19832758903503417,
|
|
"num_tokens": 8995817.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 5.328368282318115,
|
|
"epoch": 4.016457680250784,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003765309654842842,
|
|
"loss": 4.8543,
|
|
"mean_token_accuracy": 0.19569939374923706,
|
|
"num_tokens": 9005683.0,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"entropy": 5.275219869613648,
|
|
"epoch": 4.02037617554859,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003762624496768166,
|
|
"loss": 4.7808,
|
|
"mean_token_accuracy": 0.20220271348953248,
|
|
"num_tokens": 9015408.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 5.480678939819336,
|
|
"epoch": 4.024294670846395,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0003759937528969325,
|
|
"loss": 4.8989,
|
|
"mean_token_accuracy": 0.1920560657978058,
|
|
"num_tokens": 9024381.0,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"entropy": 5.34601058959961,
|
|
"epoch": 4.028213166144201,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00037572487562483666,
|
|
"loss": 4.7963,
|
|
"mean_token_accuracy": 0.20266074538230897,
|
|
"num_tokens": 9033092.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 5.311284494400025,
|
|
"epoch": 4.032131661442007,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00037545581834105623,
|
|
"loss": 4.9295,
|
|
"mean_token_accuracy": 0.19702926874160767,
|
|
"num_tokens": 9042251.0,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"entropy": 5.345129108428955,
|
|
"epoch": 4.036050156739812,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00037518658152644035,
|
|
"loss": 4.8002,
|
|
"mean_token_accuracy": 0.19346252232789993,
|
|
"num_tokens": 9050628.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 5.366067218780517,
|
|
"epoch": 4.039968652037618,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003749171656621588,
|
|
"loss": 4.8367,
|
|
"mean_token_accuracy": 0.21011017262935638,
|
|
"num_tokens": 9060080.0,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"entropy": 5.3667596817016605,
|
|
"epoch": 4.043887147335423,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003746475712297014,
|
|
"loss": 4.844,
|
|
"mean_token_accuracy": 0.19992998540401458,
|
|
"num_tokens": 9069456.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 5.377957153320312,
|
|
"epoch": 4.047805642633229,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0003743777987108771,
|
|
"loss": 4.8466,
|
|
"mean_token_accuracy": 0.2019576370716095,
|
|
"num_tokens": 9078790.0,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"entropy": 5.2944563865661625,
|
|
"epoch": 4.051724137931035,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0003741078485878132,
|
|
"loss": 4.8521,
|
|
"mean_token_accuracy": 0.19751807302236557,
|
|
"num_tokens": 9086888.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 5.344511365890503,
|
|
"epoch": 4.05564263322884,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003738377213429542,
|
|
"loss": 4.8942,
|
|
"mean_token_accuracy": 0.18725551962852477,
|
|
"num_tokens": 9095139.0,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"entropy": 5.365310335159302,
|
|
"epoch": 4.059561128526646,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0003735674174590614,
|
|
"loss": 4.7833,
|
|
"mean_token_accuracy": 0.19850707948207855,
|
|
"num_tokens": 9103464.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 5.341140699386597,
|
|
"epoch": 4.063479623824452,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00037329693741921166,
|
|
"loss": 4.8065,
|
|
"mean_token_accuracy": 0.19978338330984116,
|
|
"num_tokens": 9111290.0,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"entropy": 5.222963809967041,
|
|
"epoch": 4.067398119122257,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0003730262817067967,
|
|
"loss": 4.6724,
|
|
"mean_token_accuracy": 0.20734440684318542,
|
|
"num_tokens": 9120224.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 5.360689163208008,
|
|
"epoch": 4.071316614420063,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003727554508055221,
|
|
"loss": 4.8433,
|
|
"mean_token_accuracy": 0.19537173509597777,
|
|
"num_tokens": 9128524.0,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"entropy": 5.283273792266845,
|
|
"epoch": 4.075235109717869,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003724844451994066,
|
|
"loss": 4.721,
|
|
"mean_token_accuracy": 0.20277249068021774,
|
|
"num_tokens": 9137026.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 5.300542259216309,
|
|
"epoch": 4.079153605015674,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00037221326537278113,
|
|
"loss": 4.8016,
|
|
"mean_token_accuracy": 0.2002988949418068,
|
|
"num_tokens": 9146312.0,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"entropy": 5.311707401275635,
|
|
"epoch": 4.08307210031348,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00037194191181028806,
|
|
"loss": 4.8341,
|
|
"mean_token_accuracy": 0.19669432789087296,
|
|
"num_tokens": 9155226.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 5.393960428237915,
|
|
"epoch": 4.086990595611285,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00037167038499688024,
|
|
"loss": 4.979,
|
|
"mean_token_accuracy": 0.19061197191476822,
|
|
"num_tokens": 9164090.0,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"entropy": 5.354083108901977,
|
|
"epoch": 4.090909090909091,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00037139868541782,
|
|
"loss": 4.8064,
|
|
"mean_token_accuracy": 0.20571322739124298,
|
|
"num_tokens": 9173002.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 5.350349712371826,
|
|
"epoch": 4.094827586206897,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0003711268135586787,
|
|
"loss": 4.837,
|
|
"mean_token_accuracy": 0.19182703793048858,
|
|
"num_tokens": 9182020.0,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"entropy": 5.277394104003906,
|
|
"epoch": 4.098746081504702,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00037085476990533523,
|
|
"loss": 4.8569,
|
|
"mean_token_accuracy": 0.19545716941356658,
|
|
"num_tokens": 9190848.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 5.297388744354248,
|
|
"epoch": 4.102664576802508,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0003705825549439759,
|
|
"loss": 4.7846,
|
|
"mean_token_accuracy": 0.2089281588792801,
|
|
"num_tokens": 9198808.0,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"entropy": 5.413204908370972,
|
|
"epoch": 4.106583072100314,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00037031016916109295,
|
|
"loss": 4.9121,
|
|
"mean_token_accuracy": 0.19246796071529387,
|
|
"num_tokens": 9207923.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 5.341586589813232,
|
|
"epoch": 4.110501567398119,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00037003761304348374,
|
|
"loss": 4.8633,
|
|
"mean_token_accuracy": 0.19748846739530562,
|
|
"num_tokens": 9216759.0,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"entropy": 5.278125047683716,
|
|
"epoch": 4.114420062695925,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00036976488707825056,
|
|
"loss": 4.7931,
|
|
"mean_token_accuracy": 0.19396317452192308,
|
|
"num_tokens": 9225951.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 5.303081369400024,
|
|
"epoch": 4.118338557993731,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00036949199175279876,
|
|
"loss": 4.8625,
|
|
"mean_token_accuracy": 0.1928597003221512,
|
|
"num_tokens": 9234413.0,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"entropy": 5.386286067962646,
|
|
"epoch": 4.122257053291536,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0003692189275548364,
|
|
"loss": 4.9949,
|
|
"mean_token_accuracy": 0.18975718915462494,
|
|
"num_tokens": 9244002.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 5.408260679244995,
|
|
"epoch": 4.126175548589342,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0003689456949723737,
|
|
"loss": 4.8336,
|
|
"mean_token_accuracy": 0.19539573788642883,
|
|
"num_tokens": 9252689.0,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"entropy": 5.394720888137817,
|
|
"epoch": 4.130094043887147,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0003686722944937215,
|
|
"loss": 4.8256,
|
|
"mean_token_accuracy": 0.19857707023620605,
|
|
"num_tokens": 9261118.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 5.265163803100586,
|
|
"epoch": 4.134012539184953,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00036839872660749084,
|
|
"loss": 4.8262,
|
|
"mean_token_accuracy": 0.19689211696386338,
|
|
"num_tokens": 9269205.0,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"entropy": 5.289034748077393,
|
|
"epoch": 4.137931034482759,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003681249918025918,
|
|
"loss": 4.8346,
|
|
"mean_token_accuracy": 0.19611582905054092,
|
|
"num_tokens": 9277770.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 5.303685474395752,
|
|
"epoch": 4.141849529780564,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00036785109056823297,
|
|
"loss": 4.871,
|
|
"mean_token_accuracy": 0.19228526800870896,
|
|
"num_tokens": 9286589.0,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"entropy": 5.358338928222656,
|
|
"epoch": 4.14576802507837,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00036757702339392,
|
|
"loss": 4.849,
|
|
"mean_token_accuracy": 0.1935323506593704,
|
|
"num_tokens": 9295264.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 5.300791883468628,
|
|
"epoch": 4.149686520376176,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00036730279076945574,
|
|
"loss": 4.8562,
|
|
"mean_token_accuracy": 0.19880399703979493,
|
|
"num_tokens": 9303525.0,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"entropy": 5.304425001144409,
|
|
"epoch": 4.153605015673981,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003670283931849382,
|
|
"loss": 4.919,
|
|
"mean_token_accuracy": 0.1852765128016472,
|
|
"num_tokens": 9312839.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 5.327575397491455,
|
|
"epoch": 4.157523510971787,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00036675383113076023,
|
|
"loss": 4.8525,
|
|
"mean_token_accuracy": 0.19230640381574632,
|
|
"num_tokens": 9321462.0,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"entropy": 5.294164800643921,
|
|
"epoch": 4.161442006269592,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00036647910509760896,
|
|
"loss": 4.8788,
|
|
"mean_token_accuracy": 0.19457252472639083,
|
|
"num_tokens": 9329315.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 5.352940511703491,
|
|
"epoch": 4.165360501567398,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00036620421557646414,
|
|
"loss": 4.8288,
|
|
"mean_token_accuracy": 0.1951540604233742,
|
|
"num_tokens": 9336997.0,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"entropy": 5.286109972000122,
|
|
"epoch": 4.169278996865204,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000365929163058598,
|
|
"loss": 4.7964,
|
|
"mean_token_accuracy": 0.20091632902622222,
|
|
"num_tokens": 9345364.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 5.301588249206543,
|
|
"epoch": 4.173197492163009,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003656539480355741,
|
|
"loss": 4.8085,
|
|
"mean_token_accuracy": 0.20134562849998475,
|
|
"num_tokens": 9353961.0,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"entropy": 5.412155055999756,
|
|
"epoch": 4.177115987460815,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003653785709992462,
|
|
"loss": 4.9702,
|
|
"mean_token_accuracy": 0.1869103878736496,
|
|
"num_tokens": 9362901.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 5.333846664428711,
|
|
"epoch": 4.181034482758621,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00036510303244175775,
|
|
"loss": 4.8612,
|
|
"mean_token_accuracy": 0.19535253793001175,
|
|
"num_tokens": 9371800.0,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"entropy": 5.287358522415161,
|
|
"epoch": 4.184952978056426,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00036482733285554073,
|
|
"loss": 4.8705,
|
|
"mean_token_accuracy": 0.20408109128475188,
|
|
"num_tokens": 9380033.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 5.4015497207641605,
|
|
"epoch": 4.188871473354232,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00036455147273331507,
|
|
"loss": 4.9269,
|
|
"mean_token_accuracy": 0.19432390630245208,
|
|
"num_tokens": 9389264.0,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"entropy": 5.320134592056275,
|
|
"epoch": 4.192789968652038,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003642754525680876,
|
|
"loss": 4.8445,
|
|
"mean_token_accuracy": 0.2027723029255867,
|
|
"num_tokens": 9398112.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 5.361209726333618,
|
|
"epoch": 4.196708463949843,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00036399927285315104,
|
|
"loss": 4.9705,
|
|
"mean_token_accuracy": 0.18877116441726685,
|
|
"num_tokens": 9407400.0,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"entropy": 5.342429494857788,
|
|
"epoch": 4.200626959247649,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003637229340820834,
|
|
"loss": 4.8583,
|
|
"mean_token_accuracy": 0.18957713395357131,
|
|
"num_tokens": 9416351.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 5.4186546325683596,
|
|
"epoch": 4.204545454545454,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0003634464367487467,
|
|
"loss": 4.88,
|
|
"mean_token_accuracy": 0.19252870678901673,
|
|
"num_tokens": 9424577.0,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"entropy": 5.32326488494873,
|
|
"epoch": 4.20846394984326,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003631697813472867,
|
|
"loss": 4.8794,
|
|
"mean_token_accuracy": 0.20065970718860626,
|
|
"num_tokens": 9434242.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 5.263705587387085,
|
|
"epoch": 4.212382445141066,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00036289296837213136,
|
|
"loss": 4.837,
|
|
"mean_token_accuracy": 0.19743873178958893,
|
|
"num_tokens": 9443247.0,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"entropy": 5.284544992446899,
|
|
"epoch": 4.216300940438871,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00036261599831799034,
|
|
"loss": 4.7943,
|
|
"mean_token_accuracy": 0.19877972900867463,
|
|
"num_tokens": 9451998.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 5.350458097457886,
|
|
"epoch": 4.220219435736677,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003623388716798541,
|
|
"loss": 4.7761,
|
|
"mean_token_accuracy": 0.1993638679385185,
|
|
"num_tokens": 9459993.0,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"entropy": 5.314915370941162,
|
|
"epoch": 4.224137931034483,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00036206158895299277,
|
|
"loss": 4.8108,
|
|
"mean_token_accuracy": 0.19984694570302963,
|
|
"num_tokens": 9468036.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 5.232047080993652,
|
|
"epoch": 4.228056426332288,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00036178415063295574,
|
|
"loss": 4.891,
|
|
"mean_token_accuracy": 0.19691408127546312,
|
|
"num_tokens": 9477031.0,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"entropy": 5.320366287231446,
|
|
"epoch": 4.231974921630094,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00036150655721557,
|
|
"loss": 4.8016,
|
|
"mean_token_accuracy": 0.19657075256109238,
|
|
"num_tokens": 9485687.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 5.305319166183471,
|
|
"epoch": 4.2358934169279,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003612288091969402,
|
|
"loss": 4.8434,
|
|
"mean_token_accuracy": 0.2044542208313942,
|
|
"num_tokens": 9494613.0,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"entropy": 5.330721235275268,
|
|
"epoch": 4.239811912225705,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00036095090707344697,
|
|
"loss": 4.8422,
|
|
"mean_token_accuracy": 0.1964661344885826,
|
|
"num_tokens": 9503522.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 5.2977745056152346,
|
|
"epoch": 4.243730407523511,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00036067285134174654,
|
|
"loss": 4.877,
|
|
"mean_token_accuracy": 0.19300895184278488,
|
|
"num_tokens": 9512293.0,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"entropy": 5.416642761230468,
|
|
"epoch": 4.247648902821316,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0003603946424987696,
|
|
"loss": 4.9687,
|
|
"mean_token_accuracy": 0.18967139422893525,
|
|
"num_tokens": 9520999.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 5.350950431823731,
|
|
"epoch": 4.251567398119122,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00036011628104172045,
|
|
"loss": 4.9082,
|
|
"mean_token_accuracy": 0.19185323864221573,
|
|
"num_tokens": 9529798.0,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"entropy": 5.384712743759155,
|
|
"epoch": 4.255485893416928,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00035983776746807613,
|
|
"loss": 4.8423,
|
|
"mean_token_accuracy": 0.20016718059778213,
|
|
"num_tokens": 9538668.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 5.371892261505127,
|
|
"epoch": 4.259404388714733,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0003595591022755855,
|
|
"loss": 4.8935,
|
|
"mean_token_accuracy": 0.1966972291469574,
|
|
"num_tokens": 9546973.0,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"entropy": 5.262534809112549,
|
|
"epoch": 4.263322884012539,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0003592802859622684,
|
|
"loss": 4.8249,
|
|
"mean_token_accuracy": 0.19144313931465148,
|
|
"num_tokens": 9555947.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 5.291714668273926,
|
|
"epoch": 4.267241379310345,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000359001319026415,
|
|
"loss": 4.8971,
|
|
"mean_token_accuracy": 0.19541571140289307,
|
|
"num_tokens": 9564730.0,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"entropy": 5.251865196228027,
|
|
"epoch": 4.27115987460815,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00035872220196658426,
|
|
"loss": 4.8677,
|
|
"mean_token_accuracy": 0.19326485991477965,
|
|
"num_tokens": 9573470.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 5.3626768589019775,
|
|
"epoch": 4.275078369905956,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00035844293528160384,
|
|
"loss": 4.9022,
|
|
"mean_token_accuracy": 0.19184253066778184,
|
|
"num_tokens": 9581607.0,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"entropy": 5.309332227706909,
|
|
"epoch": 4.278996865203762,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0003581635194705682,
|
|
"loss": 4.8317,
|
|
"mean_token_accuracy": 0.19801969826221466,
|
|
"num_tokens": 9589801.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 5.324623441696167,
|
|
"epoch": 4.282915360501567,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00035788395503283914,
|
|
"loss": 4.9073,
|
|
"mean_token_accuracy": 0.19051554352045058,
|
|
"num_tokens": 9599983.0,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"entropy": 5.36219277381897,
|
|
"epoch": 4.286833855799373,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00035760424246804334,
|
|
"loss": 4.8104,
|
|
"mean_token_accuracy": 0.2058953285217285,
|
|
"num_tokens": 9608681.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 5.276078796386718,
|
|
"epoch": 4.290752351097178,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003573243822760726,
|
|
"loss": 4.8276,
|
|
"mean_token_accuracy": 0.2004624202847481,
|
|
"num_tokens": 9616461.0,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"entropy": 5.330774974822998,
|
|
"epoch": 4.294670846394984,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003570443749570825,
|
|
"loss": 4.9218,
|
|
"mean_token_accuracy": 0.19378992766141892,
|
|
"num_tokens": 9625480.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 5.361484670639038,
|
|
"epoch": 4.29858934169279,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0003567642210114914,
|
|
"loss": 4.8862,
|
|
"mean_token_accuracy": 0.19541665762662888,
|
|
"num_tokens": 9634172.0,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"entropy": 5.2651701927185055,
|
|
"epoch": 4.302507836990595,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00035648392093998,
|
|
"loss": 4.8552,
|
|
"mean_token_accuracy": 0.19159961491823196,
|
|
"num_tokens": 9643196.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 5.298784971237183,
|
|
"epoch": 4.306426332288401,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0003562034752434899,
|
|
"loss": 4.7723,
|
|
"mean_token_accuracy": 0.20255262702703475,
|
|
"num_tokens": 9652440.0,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"entropy": 5.311981439590454,
|
|
"epoch": 4.310344827586207,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003559228844232229,
|
|
"loss": 4.8611,
|
|
"mean_token_accuracy": 0.19020333439111708,
|
|
"num_tokens": 9661248.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 4.310344827586207,
|
|
"eval_entropy": 5.248077216998551,
|
|
"eval_loss": 5.749776363372803,
|
|
"eval_mean_token_accuracy": 0.1632895873383034,
|
|
"eval_num_tokens": 9661248.0,
|
|
"eval_runtime": 2.8276,
|
|
"eval_samples_per_second": 1457.793,
|
|
"eval_steps_per_second": 182.489,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 5.37710165977478,
|
|
"epoch": 4.314263322884012,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003556421489806406,
|
|
"loss": 4.9139,
|
|
"mean_token_accuracy": 0.19273322820663452,
|
|
"num_tokens": 9670544.0,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"entropy": 5.366926431655884,
|
|
"epoch": 4.318181818181818,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003553612694174624,
|
|
"loss": 4.8651,
|
|
"mean_token_accuracy": 0.19907910525798797,
|
|
"num_tokens": 9679331.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 5.395900774002075,
|
|
"epoch": 4.322100313479623,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003550802462356659,
|
|
"loss": 4.9481,
|
|
"mean_token_accuracy": 0.1884398877620697,
|
|
"num_tokens": 9688211.0,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"entropy": 5.194250631332397,
|
|
"epoch": 4.326018808777429,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00035479907993748484,
|
|
"loss": 4.7976,
|
|
"mean_token_accuracy": 0.19826843291521073,
|
|
"num_tokens": 9696231.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 5.2936523914337155,
|
|
"epoch": 4.329937304075235,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00035451777102540915,
|
|
"loss": 4.8817,
|
|
"mean_token_accuracy": 0.189493390917778,
|
|
"num_tokens": 9705577.0,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"entropy": 5.4261146068573,
|
|
"epoch": 4.33385579937304,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003542363200021835,
|
|
"loss": 4.9046,
|
|
"mean_token_accuracy": 0.19722576886415483,
|
|
"num_tokens": 9715003.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 5.271376991271973,
|
|
"epoch": 4.337774294670846,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003539547273708064,
|
|
"loss": 4.7675,
|
|
"mean_token_accuracy": 0.20243049114942552,
|
|
"num_tokens": 9723513.0,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"entropy": 5.233092832565307,
|
|
"epoch": 4.341692789968652,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00035367299363452966,
|
|
"loss": 4.8751,
|
|
"mean_token_accuracy": 0.19770222902297974,
|
|
"num_tokens": 9731800.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 5.314305019378662,
|
|
"epoch": 4.3456112852664575,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00035339111929685694,
|
|
"loss": 4.8341,
|
|
"mean_token_accuracy": 0.19684212058782577,
|
|
"num_tokens": 9740356.0,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"entropy": 5.353294086456299,
|
|
"epoch": 4.349529780564263,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00035310910486154365,
|
|
"loss": 4.8731,
|
|
"mean_token_accuracy": 0.18506744354963303,
|
|
"num_tokens": 9749529.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 5.318337631225586,
|
|
"epoch": 4.353448275862069,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003528269508325951,
|
|
"loss": 4.8504,
|
|
"mean_token_accuracy": 0.19859633445739747,
|
|
"num_tokens": 9758750.0,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"entropy": 5.278810739517212,
|
|
"epoch": 4.3573667711598745,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003525446577142663,
|
|
"loss": 4.9476,
|
|
"mean_token_accuracy": 0.19108470678329467,
|
|
"num_tokens": 9766992.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 5.3885668277740475,
|
|
"epoch": 4.36128526645768,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00035226222601106105,
|
|
"loss": 4.9549,
|
|
"mean_token_accuracy": 0.190015073120594,
|
|
"num_tokens": 9777022.0,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"entropy": 5.261571455001831,
|
|
"epoch": 4.3652037617554855,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003519796562277303,
|
|
"loss": 4.6896,
|
|
"mean_token_accuracy": 0.20998727828264235,
|
|
"num_tokens": 9785863.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 5.268274402618408,
|
|
"epoch": 4.3691222570532915,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0003516969488692723,
|
|
"loss": 4.869,
|
|
"mean_token_accuracy": 0.18851781636476517,
|
|
"num_tokens": 9794357.0,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"entropy": 5.254127883911133,
|
|
"epoch": 4.3730407523510975,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003514141044409308,
|
|
"loss": 4.8185,
|
|
"mean_token_accuracy": 0.1957067370414734,
|
|
"num_tokens": 9803128.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 5.3343264102935795,
|
|
"epoch": 4.3769592476489025,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003511311234481947,
|
|
"loss": 4.92,
|
|
"mean_token_accuracy": 0.1910697802901268,
|
|
"num_tokens": 9812046.0,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"entropy": 5.281258201599121,
|
|
"epoch": 4.3808777429467085,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00035084800639679695,
|
|
"loss": 4.8127,
|
|
"mean_token_accuracy": 0.20120692998170853,
|
|
"num_tokens": 9820935.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 5.3266654968261715,
|
|
"epoch": 4.3847962382445145,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00035056475379271356,
|
|
"loss": 4.9496,
|
|
"mean_token_accuracy": 0.19124875962734222,
|
|
"num_tokens": 9829585.0,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"entropy": 5.37768030166626,
|
|
"epoch": 4.38871473354232,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0003502813661421629,
|
|
"loss": 4.9832,
|
|
"mean_token_accuracy": 0.18994595110416412,
|
|
"num_tokens": 9839302.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 5.32978367805481,
|
|
"epoch": 4.3926332288401255,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003499978439516047,
|
|
"loss": 4.892,
|
|
"mean_token_accuracy": 0.20099774301052092,
|
|
"num_tokens": 9848330.0,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"entropy": 5.289576721191406,
|
|
"epoch": 4.396551724137931,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00034971418772773895,
|
|
"loss": 4.8436,
|
|
"mean_token_accuracy": 0.20694828331470488,
|
|
"num_tokens": 9856928.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 5.3501616477966305,
|
|
"epoch": 4.400470219435737,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0003494303979775054,
|
|
"loss": 4.8947,
|
|
"mean_token_accuracy": 0.19155489802360534,
|
|
"num_tokens": 9865464.0,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"entropy": 5.320032119750977,
|
|
"epoch": 4.4043887147335425,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00034914647520808235,
|
|
"loss": 4.8677,
|
|
"mean_token_accuracy": 0.1985853910446167,
|
|
"num_tokens": 9874345.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 5.217613315582275,
|
|
"epoch": 4.408307210031348,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0003488624199268859,
|
|
"loss": 4.8358,
|
|
"mean_token_accuracy": 0.20405414700508118,
|
|
"num_tokens": 9883135.0,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"entropy": 5.3447154521942135,
|
|
"epoch": 4.412225705329154,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0003485782326415687,
|
|
"loss": 4.9162,
|
|
"mean_token_accuracy": 0.19513294845819473,
|
|
"num_tokens": 9892229.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 5.355678558349609,
|
|
"epoch": 4.41614420062696,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003482939138600197,
|
|
"loss": 4.9231,
|
|
"mean_token_accuracy": 0.1947724536061287,
|
|
"num_tokens": 9901044.0,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"entropy": 5.351405143737793,
|
|
"epoch": 4.420062695924765,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003480094640903626,
|
|
"loss": 4.931,
|
|
"mean_token_accuracy": 0.19275193065404891,
|
|
"num_tokens": 9909818.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 5.254307508468628,
|
|
"epoch": 4.423981191222571,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00034772488384095517,
|
|
"loss": 4.7924,
|
|
"mean_token_accuracy": 0.20588268637657164,
|
|
"num_tokens": 9919295.0,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"entropy": 5.285707759857178,
|
|
"epoch": 4.427899686520377,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00034744017362038854,
|
|
"loss": 4.8964,
|
|
"mean_token_accuracy": 0.193529711663723,
|
|
"num_tokens": 9927539.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 5.312068700790405,
|
|
"epoch": 4.431818181818182,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00034715533393748604,
|
|
"loss": 4.9331,
|
|
"mean_token_accuracy": 0.19326264411211014,
|
|
"num_tokens": 9936540.0,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"entropy": 5.300959777832031,
|
|
"epoch": 4.435736677115988,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00034687036530130244,
|
|
"loss": 4.7829,
|
|
"mean_token_accuracy": 0.20409155637025833,
|
|
"num_tokens": 9945065.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 5.309552812576294,
|
|
"epoch": 4.439655172413793,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0003465852682211227,
|
|
"loss": 4.8241,
|
|
"mean_token_accuracy": 0.19703736603260041,
|
|
"num_tokens": 9954155.0,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"entropy": 5.282710695266724,
|
|
"epoch": 4.443573667711599,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003463000432064617,
|
|
"loss": 4.9015,
|
|
"mean_token_accuracy": 0.19094373732805253,
|
|
"num_tokens": 9963269.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 5.362774848937988,
|
|
"epoch": 4.447492163009405,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00034601469076706276,
|
|
"loss": 4.9442,
|
|
"mean_token_accuracy": 0.19358460158109664,
|
|
"num_tokens": 9972688.0,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"entropy": 5.4085431575775145,
|
|
"epoch": 4.45141065830721,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0003457292114128968,
|
|
"loss": 4.8887,
|
|
"mean_token_accuracy": 0.19286465495824814,
|
|
"num_tokens": 9981283.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 5.274885272979736,
|
|
"epoch": 4.455329153605016,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000345443605654162,
|
|
"loss": 4.9088,
|
|
"mean_token_accuracy": 0.1932951644062996,
|
|
"num_tokens": 9989870.0,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"entropy": 5.293261957168579,
|
|
"epoch": 4.459247648902822,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00034515787400128195,
|
|
"loss": 4.9129,
|
|
"mean_token_accuracy": 0.19092471301555633,
|
|
"num_tokens": 9998136.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 5.332083702087402,
|
|
"epoch": 4.463166144200627,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0003448720169649054,
|
|
"loss": 4.8605,
|
|
"mean_token_accuracy": 0.1922208473086357,
|
|
"num_tokens": 10006730.0,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"entropy": 5.337637710571289,
|
|
"epoch": 4.467084639498433,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0003445860350559055,
|
|
"loss": 4.8506,
|
|
"mean_token_accuracy": 0.20467009395360947,
|
|
"num_tokens": 10015517.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 5.234624528884888,
|
|
"epoch": 4.471003134796238,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000344299928785378,
|
|
"loss": 4.9077,
|
|
"mean_token_accuracy": 0.19626541435718536,
|
|
"num_tokens": 10024552.0,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"entropy": 5.289376926422119,
|
|
"epoch": 4.474921630094044,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00034401369866464133,
|
|
"loss": 4.8799,
|
|
"mean_token_accuracy": 0.19567616283893585,
|
|
"num_tokens": 10033523.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 5.276830768585205,
|
|
"epoch": 4.47884012539185,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00034372734520523505,
|
|
"loss": 4.7729,
|
|
"mean_token_accuracy": 0.19215544909238816,
|
|
"num_tokens": 10041988.0,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"entropy": 5.254915189743042,
|
|
"epoch": 4.482758620689655,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003434408689189193,
|
|
"loss": 4.8576,
|
|
"mean_token_accuracy": 0.20303562581539153,
|
|
"num_tokens": 10050543.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 5.3186054706573485,
|
|
"epoch": 4.486677115987461,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003431542703176737,
|
|
"loss": 4.8736,
|
|
"mean_token_accuracy": 0.1990189731121063,
|
|
"num_tokens": 10059300.0,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"entropy": 5.299335289001465,
|
|
"epoch": 4.490595611285267,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003428675499136963,
|
|
"loss": 4.9096,
|
|
"mean_token_accuracy": 0.1965264245867729,
|
|
"num_tokens": 10068535.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 5.33660740852356,
|
|
"epoch": 4.494514106583072,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000342580708219403,
|
|
"loss": 4.9551,
|
|
"mean_token_accuracy": 0.19655793607234956,
|
|
"num_tokens": 10077731.0,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"entropy": 5.306480646133423,
|
|
"epoch": 4.498432601880878,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003422937457474264,
|
|
"loss": 4.8472,
|
|
"mean_token_accuracy": 0.19078222960233687,
|
|
"num_tokens": 10087146.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 5.302515649795533,
|
|
"epoch": 4.502351097178684,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00034200666301061495,
|
|
"loss": 4.8527,
|
|
"mean_token_accuracy": 0.19615772366523743,
|
|
"num_tokens": 10096659.0,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"entropy": 5.330955839157104,
|
|
"epoch": 4.506269592476489,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00034171946052203217,
|
|
"loss": 4.9463,
|
|
"mean_token_accuracy": 0.1952238753437996,
|
|
"num_tokens": 10105403.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 5.405318927764893,
|
|
"epoch": 4.510188087774295,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00034143213879495526,
|
|
"loss": 4.8819,
|
|
"mean_token_accuracy": 0.1959804505109787,
|
|
"num_tokens": 10115410.0,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"entropy": 5.30314073562622,
|
|
"epoch": 4.514106583072101,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000341144698342875,
|
|
"loss": 4.9059,
|
|
"mean_token_accuracy": 0.19124898910522461,
|
|
"num_tokens": 10123800.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 5.359114027023315,
|
|
"epoch": 4.518025078369906,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0003408571396794939,
|
|
"loss": 4.9269,
|
|
"mean_token_accuracy": 0.19253372251987458,
|
|
"num_tokens": 10132775.0,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"entropy": 5.300523567199707,
|
|
"epoch": 4.521943573667712,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0003405694633187259,
|
|
"loss": 4.8243,
|
|
"mean_token_accuracy": 0.19811718314886093,
|
|
"num_tokens": 10141811.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 5.239429950714111,
|
|
"epoch": 4.525862068965517,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00034028166977469545,
|
|
"loss": 4.8413,
|
|
"mean_token_accuracy": 0.1970396563410759,
|
|
"num_tokens": 10150823.0,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"entropy": 5.328087615966797,
|
|
"epoch": 4.529780564263323,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00033999375956173607,
|
|
"loss": 4.8443,
|
|
"mean_token_accuracy": 0.19740379452705384,
|
|
"num_tokens": 10158983.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 5.348503494262696,
|
|
"epoch": 4.533699059561129,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0003397057331943902,
|
|
"loss": 4.8488,
|
|
"mean_token_accuracy": 0.19886207431554795,
|
|
"num_tokens": 10167596.0,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"entropy": 5.278054809570312,
|
|
"epoch": 4.537617554858934,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0003394175911874076,
|
|
"loss": 4.9183,
|
|
"mean_token_accuracy": 0.1959143877029419,
|
|
"num_tokens": 10176216.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 5.27154655456543,
|
|
"epoch": 4.54153605015674,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0003391293340557446,
|
|
"loss": 4.8187,
|
|
"mean_token_accuracy": 0.196979558467865,
|
|
"num_tokens": 10185685.0,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"entropy": 5.311609601974487,
|
|
"epoch": 4.545454545454545,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003388409623145637,
|
|
"loss": 4.8302,
|
|
"mean_token_accuracy": 0.19562277495861052,
|
|
"num_tokens": 10193679.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 5.314465045928955,
|
|
"epoch": 4.549373040752351,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00033855247647923177,
|
|
"loss": 4.8847,
|
|
"mean_token_accuracy": 0.2017224296927452,
|
|
"num_tokens": 10201154.0,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"entropy": 5.291952085494995,
|
|
"epoch": 4.553291536050157,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003382638770653198,
|
|
"loss": 4.8034,
|
|
"mean_token_accuracy": 0.20398634672164917,
|
|
"num_tokens": 10209180.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 5.332205009460449,
|
|
"epoch": 4.557210031347962,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0003379751645886017,
|
|
"loss": 4.8946,
|
|
"mean_token_accuracy": 0.19163894057273864,
|
|
"num_tokens": 10217430.0,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"entropy": 5.289204931259155,
|
|
"epoch": 4.561128526645768,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003376863395650537,
|
|
"loss": 4.9579,
|
|
"mean_token_accuracy": 0.19430561512708663,
|
|
"num_tokens": 10226453.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 5.411459732055664,
|
|
"epoch": 4.565047021943574,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00033739740251085263,
|
|
"loss": 4.988,
|
|
"mean_token_accuracy": 0.19136859327554703,
|
|
"num_tokens": 10235005.0,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"entropy": 5.403548669815064,
|
|
"epoch": 4.568965517241379,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00033710835394237603,
|
|
"loss": 4.8996,
|
|
"mean_token_accuracy": 0.19628941118717194,
|
|
"num_tokens": 10243800.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 5.262275171279907,
|
|
"epoch": 4.572884012539185,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00033681919437620066,
|
|
"loss": 4.8703,
|
|
"mean_token_accuracy": 0.2014416426420212,
|
|
"num_tokens": 10252370.0,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"entropy": 5.295157718658447,
|
|
"epoch": 4.576802507836991,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0003365299243291014,
|
|
"loss": 4.9385,
|
|
"mean_token_accuracy": 0.19208094030618666,
|
|
"num_tokens": 10260777.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 5.319491243362426,
|
|
"epoch": 4.580721003134796,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00033624054431805067,
|
|
"loss": 4.9397,
|
|
"mean_token_accuracy": 0.1825719639658928,
|
|
"num_tokens": 10270571.0,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"entropy": 5.391564750671387,
|
|
"epoch": 4.584639498432602,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00033595105486021763,
|
|
"loss": 4.9174,
|
|
"mean_token_accuracy": 0.1958574578166008,
|
|
"num_tokens": 10279284.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 5.274766731262207,
|
|
"epoch": 4.588557993730408,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00033566145647296675,
|
|
"loss": 4.8215,
|
|
"mean_token_accuracy": 0.19175848811864854,
|
|
"num_tokens": 10288610.0,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"entropy": 5.26464262008667,
|
|
"epoch": 4.592476489028213,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00033537174967385744,
|
|
"loss": 4.8178,
|
|
"mean_token_accuracy": 0.19997525513172149,
|
|
"num_tokens": 10296603.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 5.231957292556762,
|
|
"epoch": 4.596394984326019,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00033508193498064254,
|
|
"loss": 4.8005,
|
|
"mean_token_accuracy": 0.20198751240968704,
|
|
"num_tokens": 10305199.0,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"entropy": 5.2481550693511965,
|
|
"epoch": 4.600313479623824,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00033479201291126807,
|
|
"loss": 4.7616,
|
|
"mean_token_accuracy": 0.20384960919618605,
|
|
"num_tokens": 10313967.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 5.285054874420166,
|
|
"epoch": 4.60423197492163,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003345019839838717,
|
|
"loss": 4.8634,
|
|
"mean_token_accuracy": 0.20132818669080735,
|
|
"num_tokens": 10322532.0,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"entropy": 5.2304784774780275,
|
|
"epoch": 4.608150470219436,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003342118487167822,
|
|
"loss": 4.7192,
|
|
"mean_token_accuracy": 0.20240301340818406,
|
|
"num_tokens": 10330631.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 5.203008651733398,
|
|
"epoch": 4.612068965517241,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00033392160762851813,
|
|
"loss": 4.7726,
|
|
"mean_token_accuracy": 0.20057196766138077,
|
|
"num_tokens": 10339229.0,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"entropy": 5.33842830657959,
|
|
"epoch": 4.615987460815047,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003336312612377877,
|
|
"loss": 4.8712,
|
|
"mean_token_accuracy": 0.19062244594097139,
|
|
"num_tokens": 10348882.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 5.405837821960449,
|
|
"epoch": 4.619905956112853,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00033334081006348683,
|
|
"loss": 5.0137,
|
|
"mean_token_accuracy": 0.18338688015937804,
|
|
"num_tokens": 10358154.0,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"entropy": 5.326955509185791,
|
|
"epoch": 4.623824451410658,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0003330502546246988,
|
|
"loss": 4.9173,
|
|
"mean_token_accuracy": 0.19435677230358123,
|
|
"num_tokens": 10367100.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 5.228698635101319,
|
|
"epoch": 4.627742946708464,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00033275959544069335,
|
|
"loss": 4.7217,
|
|
"mean_token_accuracy": 0.21096832901239396,
|
|
"num_tokens": 10375711.0,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"entropy": 5.298200368881226,
|
|
"epoch": 4.631661442006269,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00033246883303092564,
|
|
"loss": 4.8347,
|
|
"mean_token_accuracy": 0.19578275382518767,
|
|
"num_tokens": 10384513.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 5.32994966506958,
|
|
"epoch": 4.635579937304075,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00033217796791503503,
|
|
"loss": 4.8979,
|
|
"mean_token_accuracy": 0.19180350005626678,
|
|
"num_tokens": 10393564.0,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"entropy": 5.340075159072876,
|
|
"epoch": 4.639498432601881,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0003318870006128449,
|
|
"loss": 4.9009,
|
|
"mean_token_accuracy": 0.19497937262058257,
|
|
"num_tokens": 10402042.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 5.2525848865509035,
|
|
"epoch": 4.643416927899686,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003315959316443608,
|
|
"loss": 4.8005,
|
|
"mean_token_accuracy": 0.19507636427879332,
|
|
"num_tokens": 10410847.0,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"entropy": 5.302730655670166,
|
|
"epoch": 4.647335423197492,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0003313047615297703,
|
|
"loss": 4.9173,
|
|
"mean_token_accuracy": 0.19071874022483826,
|
|
"num_tokens": 10420366.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 5.27545428276062,
|
|
"epoch": 4.651253918495298,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00033101349078944165,
|
|
"loss": 4.8207,
|
|
"mean_token_accuracy": 0.1988954409956932,
|
|
"num_tokens": 10428870.0,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"entropy": 5.289353847503662,
|
|
"epoch": 4.655172413793103,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003307221199439227,
|
|
"loss": 4.8875,
|
|
"mean_token_accuracy": 0.1902886673808098,
|
|
"num_tokens": 10437714.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 5.229666614532471,
|
|
"epoch": 4.659090909090909,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00033043064951394045,
|
|
"loss": 4.7743,
|
|
"mean_token_accuracy": 0.20280301570892334,
|
|
"num_tokens": 10447198.0,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"entropy": 5.318718528747558,
|
|
"epoch": 4.663009404388715,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00033013908002039994,
|
|
"loss": 4.8343,
|
|
"mean_token_accuracy": 0.20482680797576905,
|
|
"num_tokens": 10455790.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 5.347198152542115,
|
|
"epoch": 4.66692789968652,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00032984741198438305,
|
|
"loss": 4.9279,
|
|
"mean_token_accuracy": 0.19780130237340926,
|
|
"num_tokens": 10464094.0,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"entropy": 5.271975326538086,
|
|
"epoch": 4.670846394984326,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003295556459271478,
|
|
"loss": 4.9087,
|
|
"mean_token_accuracy": 0.19745671898126602,
|
|
"num_tokens": 10473201.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 5.3372969150543215,
|
|
"epoch": 4.674764890282132,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003292637823701276,
|
|
"loss": 4.9174,
|
|
"mean_token_accuracy": 0.18904770612716676,
|
|
"num_tokens": 10482617.0,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"entropy": 5.318005990982056,
|
|
"epoch": 4.678683385579937,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003289718218349298,
|
|
"loss": 4.7931,
|
|
"mean_token_accuracy": 0.20507727265357972,
|
|
"num_tokens": 10491702.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 5.395375823974609,
|
|
"epoch": 4.682601880877743,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0003286797648433351,
|
|
"loss": 4.9914,
|
|
"mean_token_accuracy": 0.19012427777051927,
|
|
"num_tokens": 10501189.0,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"entropy": 5.289163064956665,
|
|
"epoch": 4.686520376175548,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000328387611917297,
|
|
"loss": 4.8558,
|
|
"mean_token_accuracy": 0.19690819084644318,
|
|
"num_tokens": 10510272.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 5.362410306930542,
|
|
"epoch": 4.690438871473354,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0003280953635789401,
|
|
"loss": 4.9339,
|
|
"mean_token_accuracy": 0.19246556907892226,
|
|
"num_tokens": 10518782.0,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"entropy": 5.279406404495239,
|
|
"epoch": 4.69435736677116,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003278030203505594,
|
|
"loss": 4.8822,
|
|
"mean_token_accuracy": 0.19507061094045638,
|
|
"num_tokens": 10528262.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 5.220873260498047,
|
|
"epoch": 4.698275862068965,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003275105827546197,
|
|
"loss": 4.8613,
|
|
"mean_token_accuracy": 0.19446012526750564,
|
|
"num_tokens": 10536399.0,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"entropy": 5.352699279785156,
|
|
"epoch": 4.702194357366771,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00032721805131375446,
|
|
"loss": 4.9148,
|
|
"mean_token_accuracy": 0.19234919399023057,
|
|
"num_tokens": 10544842.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 4.702194357366771,
|
|
"eval_entropy": 5.2583259491957435,
|
|
"eval_loss": 5.718573093414307,
|
|
"eval_mean_token_accuracy": 0.16578412574333276,
|
|
"eval_num_tokens": 10544842.0,
|
|
"eval_runtime": 3.0114,
|
|
"eval_samples_per_second": 1368.819,
|
|
"eval_steps_per_second": 171.351,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 5.326291799545288,
|
|
"epoch": 4.706112852664576,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0003269254265507647,
|
|
"loss": 4.857,
|
|
"mean_token_accuracy": 0.1958915650844574,
|
|
"num_tokens": 10553630.0,
|
|
"step": 6005
|
|
},
|
|
{
|
|
"entropy": 5.236906337738037,
|
|
"epoch": 4.710031347962382,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00032663270898861845,
|
|
"loss": 4.8591,
|
|
"mean_token_accuracy": 0.19698517322540282,
|
|
"num_tokens": 10561714.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 5.309299564361572,
|
|
"epoch": 4.713949843260188,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00032633989915044944,
|
|
"loss": 4.9394,
|
|
"mean_token_accuracy": 0.19155541956424713,
|
|
"num_tokens": 10571075.0,
|
|
"step": 6015
|
|
},
|
|
{
|
|
"entropy": 5.335476875305176,
|
|
"epoch": 4.717868338557993,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00032604699755955615,
|
|
"loss": 4.823,
|
|
"mean_token_accuracy": 0.20272691249847413,
|
|
"num_tokens": 10579698.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 5.258983755111695,
|
|
"epoch": 4.721786833855799,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00032575400473940135,
|
|
"loss": 4.8593,
|
|
"mean_token_accuracy": 0.1937871977686882,
|
|
"num_tokens": 10588060.0,
|
|
"step": 6025
|
|
},
|
|
{
|
|
"entropy": 5.232053327560425,
|
|
"epoch": 4.725705329153605,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0003254609212136108,
|
|
"loss": 4.8849,
|
|
"mean_token_accuracy": 0.19797635674476624,
|
|
"num_tokens": 10596482.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 5.281117677688599,
|
|
"epoch": 4.72962382445141,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0003251677475059721,
|
|
"loss": 4.9074,
|
|
"mean_token_accuracy": 0.19144712686538695,
|
|
"num_tokens": 10605630.0,
|
|
"step": 6035
|
|
},
|
|
{
|
|
"entropy": 5.352448415756226,
|
|
"epoch": 4.733542319749216,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00032487448414043435,
|
|
"loss": 4.9064,
|
|
"mean_token_accuracy": 0.198373182117939,
|
|
"num_tokens": 10614772.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 5.3052754402160645,
|
|
"epoch": 4.737460815047022,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00032458113164110665,
|
|
"loss": 4.8833,
|
|
"mean_token_accuracy": 0.19952247738838197,
|
|
"num_tokens": 10623583.0,
|
|
"step": 6045
|
|
},
|
|
{
|
|
"entropy": 5.3142999649047855,
|
|
"epoch": 4.741379310344827,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0003242876905322575,
|
|
"loss": 4.8313,
|
|
"mean_token_accuracy": 0.2084073856472969,
|
|
"num_tokens": 10632136.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 5.291184377670288,
|
|
"epoch": 4.745297805642633,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0003239941613383138,
|
|
"loss": 4.9275,
|
|
"mean_token_accuracy": 0.19395726472139357,
|
|
"num_tokens": 10641306.0,
|
|
"step": 6055
|
|
},
|
|
{
|
|
"entropy": 5.308550596237183,
|
|
"epoch": 4.749216300940439,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00032370054458385973,
|
|
"loss": 4.872,
|
|
"mean_token_accuracy": 0.19650894552469253,
|
|
"num_tokens": 10650866.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 5.341393852233887,
|
|
"epoch": 4.753134796238244,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000323406840793636,
|
|
"loss": 4.9002,
|
|
"mean_token_accuracy": 0.20345784425735475,
|
|
"num_tokens": 10659252.0,
|
|
"step": 6065
|
|
},
|
|
{
|
|
"entropy": 5.284671068191528,
|
|
"epoch": 4.75705329153605,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000323113050492539,
|
|
"loss": 4.7826,
|
|
"mean_token_accuracy": 0.20257387161254883,
|
|
"num_tokens": 10667987.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 5.2915153980255125,
|
|
"epoch": 4.7609717868338555,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00032281917420561967,
|
|
"loss": 4.9641,
|
|
"mean_token_accuracy": 0.1922558456659317,
|
|
"num_tokens": 10676524.0,
|
|
"step": 6075
|
|
},
|
|
{
|
|
"entropy": 5.289895725250244,
|
|
"epoch": 4.764890282131661,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003225252124580825,
|
|
"loss": 4.8958,
|
|
"mean_token_accuracy": 0.19667108356952667,
|
|
"num_tokens": 10685940.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 5.310254621505737,
|
|
"epoch": 4.768808777429467,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00032223116577528475,
|
|
"loss": 4.8928,
|
|
"mean_token_accuracy": 0.19574060738086702,
|
|
"num_tokens": 10694522.0,
|
|
"step": 6085
|
|
},
|
|
{
|
|
"entropy": 5.287983465194702,
|
|
"epoch": 4.7727272727272725,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003219370346827356,
|
|
"loss": 4.8294,
|
|
"mean_token_accuracy": 0.19738385379314421,
|
|
"num_tokens": 10703868.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 5.264180278778076,
|
|
"epoch": 4.7766457680250785,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000321642819706095,
|
|
"loss": 4.8279,
|
|
"mean_token_accuracy": 0.19939566552639007,
|
|
"num_tokens": 10712044.0,
|
|
"step": 6095
|
|
},
|
|
{
|
|
"entropy": 5.365434455871582,
|
|
"epoch": 4.7805642633228835,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003213485213711729,
|
|
"loss": 5.0228,
|
|
"mean_token_accuracy": 0.1870979204773903,
|
|
"num_tokens": 10720806.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 5.235752105712891,
|
|
"epoch": 4.7844827586206895,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000321054140203928,
|
|
"loss": 4.7508,
|
|
"mean_token_accuracy": 0.20540912300348282,
|
|
"num_tokens": 10729238.0,
|
|
"step": 6105
|
|
},
|
|
{
|
|
"entropy": 5.253376197814942,
|
|
"epoch": 4.7884012539184955,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0003207596767304672,
|
|
"loss": 4.8335,
|
|
"mean_token_accuracy": 0.2007951557636261,
|
|
"num_tokens": 10738175.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 5.342274188995361,
|
|
"epoch": 4.7923197492163006,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00032046513147704456,
|
|
"loss": 4.8981,
|
|
"mean_token_accuracy": 0.1927331030368805,
|
|
"num_tokens": 10747475.0,
|
|
"step": 6115
|
|
},
|
|
{
|
|
"entropy": 5.332991027832032,
|
|
"epoch": 4.7962382445141065,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00032017050497006,
|
|
"loss": 4.8561,
|
|
"mean_token_accuracy": 0.19804591834545135,
|
|
"num_tokens": 10756739.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 5.250210237503052,
|
|
"epoch": 4.8001567398119125,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00031987579773605913,
|
|
"loss": 4.8656,
|
|
"mean_token_accuracy": 0.19694567918777467,
|
|
"num_tokens": 10765141.0,
|
|
"step": 6125
|
|
},
|
|
{
|
|
"entropy": 5.33366174697876,
|
|
"epoch": 4.804075235109718,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003195810103017314,
|
|
"loss": 4.9471,
|
|
"mean_token_accuracy": 0.1937161296606064,
|
|
"num_tokens": 10774759.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 5.358836317062378,
|
|
"epoch": 4.8079937304075235,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0003192861431939098,
|
|
"loss": 4.9219,
|
|
"mean_token_accuracy": 0.19801453053951262,
|
|
"num_tokens": 10783717.0,
|
|
"step": 6135
|
|
},
|
|
{
|
|
"entropy": 5.293142557144165,
|
|
"epoch": 4.8119122257053295,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003189911969395697,
|
|
"loss": 4.9059,
|
|
"mean_token_accuracy": 0.19318787902593612,
|
|
"num_tokens": 10792797.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 5.1863912582397464,
|
|
"epoch": 4.815830721003135,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00031869617206582777,
|
|
"loss": 4.7765,
|
|
"mean_token_accuracy": 0.2064749151468277,
|
|
"num_tokens": 10802249.0,
|
|
"step": 6145
|
|
},
|
|
{
|
|
"entropy": 5.280630588531494,
|
|
"epoch": 4.8197492163009406,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0003184010690999413,
|
|
"loss": 4.8703,
|
|
"mean_token_accuracy": 0.19510459303855895,
|
|
"num_tokens": 10810722.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 5.3425170421600345,
|
|
"epoch": 4.8236677115987465,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0003181058885693073,
|
|
"loss": 4.9496,
|
|
"mean_token_accuracy": 0.19361832737922668,
|
|
"num_tokens": 10819115.0,
|
|
"step": 6155
|
|
},
|
|
{
|
|
"entropy": 5.3338323593139645,
|
|
"epoch": 4.827586206896552,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0003178106310014612,
|
|
"loss": 4.9345,
|
|
"mean_token_accuracy": 0.19588030576705934,
|
|
"num_tokens": 10827419.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 5.217754173278808,
|
|
"epoch": 4.831504702194358,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003175152969240759,
|
|
"loss": 4.8062,
|
|
"mean_token_accuracy": 0.20195651054382324,
|
|
"num_tokens": 10836610.0,
|
|
"step": 6165
|
|
},
|
|
{
|
|
"entropy": 5.292611122131348,
|
|
"epoch": 4.835423197492163,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00031721988686496156,
|
|
"loss": 4.8879,
|
|
"mean_token_accuracy": 0.1986537203192711,
|
|
"num_tokens": 10845127.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 5.289541482925415,
|
|
"epoch": 4.839341692789969,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003169244013520638,
|
|
"loss": 4.8997,
|
|
"mean_token_accuracy": 0.20067002773284912,
|
|
"num_tokens": 10853458.0,
|
|
"step": 6175
|
|
},
|
|
{
|
|
"entropy": 5.373119592666626,
|
|
"epoch": 4.843260188087775,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000316628840913463,
|
|
"loss": 4.9922,
|
|
"mean_token_accuracy": 0.18615848273038865,
|
|
"num_tokens": 10862134.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 5.358973217010498,
|
|
"epoch": 4.84717868338558,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0003163332060773738,
|
|
"loss": 4.9918,
|
|
"mean_token_accuracy": 0.1928128257393837,
|
|
"num_tokens": 10872127.0,
|
|
"step": 6185
|
|
},
|
|
{
|
|
"entropy": 5.31832365989685,
|
|
"epoch": 4.851097178683386,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003160374973721434,
|
|
"loss": 4.9184,
|
|
"mean_token_accuracy": 0.18799601048231124,
|
|
"num_tokens": 10880617.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 5.349193668365478,
|
|
"epoch": 4.855015673981192,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0003157417153262513,
|
|
"loss": 4.8826,
|
|
"mean_token_accuracy": 0.1995572790503502,
|
|
"num_tokens": 10888769.0,
|
|
"step": 6195
|
|
},
|
|
{
|
|
"entropy": 5.310605478286743,
|
|
"epoch": 4.858934169278997,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00031544586046830796,
|
|
"loss": 4.8636,
|
|
"mean_token_accuracy": 0.1942999005317688,
|
|
"num_tokens": 10897638.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 5.275332307815551,
|
|
"epoch": 4.862852664576803,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000315149933327054,
|
|
"loss": 4.8151,
|
|
"mean_token_accuracy": 0.1994323506951332,
|
|
"num_tokens": 10906224.0,
|
|
"step": 6205
|
|
},
|
|
{
|
|
"entropy": 5.2196086883544925,
|
|
"epoch": 4.866771159874608,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00031485393443135914,
|
|
"loss": 4.811,
|
|
"mean_token_accuracy": 0.20627595484256744,
|
|
"num_tokens": 10915390.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 5.3287028789520265,
|
|
"epoch": 4.870689655172414,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0003145578643102214,
|
|
"loss": 4.8887,
|
|
"mean_token_accuracy": 0.19652045220136644,
|
|
"num_tokens": 10923719.0,
|
|
"step": 6215
|
|
},
|
|
{
|
|
"entropy": 5.281140947341919,
|
|
"epoch": 4.87460815047022,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0003142617234927662,
|
|
"loss": 4.8626,
|
|
"mean_token_accuracy": 0.19312359094619752,
|
|
"num_tokens": 10932610.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 5.324745750427246,
|
|
"epoch": 4.878526645768025,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00031396551250824513,
|
|
"loss": 4.8403,
|
|
"mean_token_accuracy": 0.19821392446756364,
|
|
"num_tokens": 10941315.0,
|
|
"step": 6225
|
|
},
|
|
{
|
|
"entropy": 5.161372709274292,
|
|
"epoch": 4.882445141065831,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003136692318860352,
|
|
"loss": 4.7968,
|
|
"mean_token_accuracy": 0.1987773060798645,
|
|
"num_tokens": 10949781.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 5.364647960662841,
|
|
"epoch": 4.886363636363637,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0003133728821556381,
|
|
"loss": 4.9936,
|
|
"mean_token_accuracy": 0.18877088725566865,
|
|
"num_tokens": 10958207.0,
|
|
"step": 6235
|
|
},
|
|
{
|
|
"entropy": 5.35259895324707,
|
|
"epoch": 4.890282131661442,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00031307646384667854,
|
|
"loss": 4.8955,
|
|
"mean_token_accuracy": 0.1886019766330719,
|
|
"num_tokens": 10967162.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 5.338934135437012,
|
|
"epoch": 4.894200626959248,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00031277997748890433,
|
|
"loss": 4.9579,
|
|
"mean_token_accuracy": 0.18787245750427245,
|
|
"num_tokens": 10975517.0,
|
|
"step": 6245
|
|
},
|
|
{
|
|
"entropy": 5.303982877731324,
|
|
"epoch": 4.898119122257054,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003124834236121847,
|
|
"loss": 4.8305,
|
|
"mean_token_accuracy": 0.1967819020152092,
|
|
"num_tokens": 10984668.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 5.254429817199707,
|
|
"epoch": 4.902037617554859,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00031218680274650934,
|
|
"loss": 4.8247,
|
|
"mean_token_accuracy": 0.1917088508605957,
|
|
"num_tokens": 10993475.0,
|
|
"step": 6255
|
|
},
|
|
{
|
|
"entropy": 5.213412952423096,
|
|
"epoch": 4.905956112852665,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00031189011542198794,
|
|
"loss": 4.7938,
|
|
"mean_token_accuracy": 0.21038636267185212,
|
|
"num_tokens": 11002419.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 5.2475042819976805,
|
|
"epoch": 4.90987460815047,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0003115933621688488,
|
|
"loss": 4.8712,
|
|
"mean_token_accuracy": 0.19950293600559235,
|
|
"num_tokens": 11011271.0,
|
|
"step": 6265
|
|
},
|
|
{
|
|
"entropy": 5.36415843963623,
|
|
"epoch": 4.913793103448276,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00031129654351743816,
|
|
"loss": 4.8959,
|
|
"mean_token_accuracy": 0.1931706339120865,
|
|
"num_tokens": 11020461.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 5.360075616836548,
|
|
"epoch": 4.917711598746082,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00031099965999821906,
|
|
"loss": 4.8036,
|
|
"mean_token_accuracy": 0.20114846676588058,
|
|
"num_tokens": 11029251.0,
|
|
"step": 6275
|
|
},
|
|
{
|
|
"entropy": 5.301220798492432,
|
|
"epoch": 4.921630094043887,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0003107027121417704,
|
|
"loss": 4.8355,
|
|
"mean_token_accuracy": 0.20290125906467438,
|
|
"num_tokens": 11038149.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 5.328409194946289,
|
|
"epoch": 4.925548589341693,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00031040570047878634,
|
|
"loss": 4.9214,
|
|
"mean_token_accuracy": 0.19522561579942704,
|
|
"num_tokens": 11046262.0,
|
|
"step": 6285
|
|
},
|
|
{
|
|
"entropy": 5.343040227890015,
|
|
"epoch": 4.929467084639499,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00031010862554007473,
|
|
"loss": 4.9524,
|
|
"mean_token_accuracy": 0.18954694271087646,
|
|
"num_tokens": 11055755.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 5.284920644760132,
|
|
"epoch": 4.933385579937304,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0003098114878565567,
|
|
"loss": 4.8427,
|
|
"mean_token_accuracy": 0.19753293842077255,
|
|
"num_tokens": 11064534.0,
|
|
"step": 6295
|
|
},
|
|
{
|
|
"entropy": 5.303559684753418,
|
|
"epoch": 4.93730407523511,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00030951428795926536,
|
|
"loss": 4.9028,
|
|
"mean_token_accuracy": 0.19815993309020996,
|
|
"num_tokens": 11073768.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 5.288099098205566,
|
|
"epoch": 4.941222570532915,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0003092170263793452,
|
|
"loss": 4.8995,
|
|
"mean_token_accuracy": 0.20102884322404863,
|
|
"num_tokens": 11083683.0,
|
|
"step": 6305
|
|
},
|
|
{
|
|
"entropy": 5.333381271362304,
|
|
"epoch": 4.945141065830721,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00030891970364805096,
|
|
"loss": 4.9398,
|
|
"mean_token_accuracy": 0.19457490146160125,
|
|
"num_tokens": 11092368.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"entropy": 5.244618320465088,
|
|
"epoch": 4.949059561128527,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003086223202967464,
|
|
"loss": 4.8368,
|
|
"mean_token_accuracy": 0.2018519252538681,
|
|
"num_tokens": 11100544.0,
|
|
"step": 6315
|
|
},
|
|
{
|
|
"entropy": 5.286687612533569,
|
|
"epoch": 4.952978056426332,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000308324876856904,
|
|
"loss": 4.8741,
|
|
"mean_token_accuracy": 0.19442498236894606,
|
|
"num_tokens": 11108962.0,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"entropy": 5.36750898361206,
|
|
"epoch": 4.956896551724138,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00030802737386010314,
|
|
"loss": 4.9387,
|
|
"mean_token_accuracy": 0.19592072516679765,
|
|
"num_tokens": 11117011.0,
|
|
"step": 6325
|
|
},
|
|
{
|
|
"entropy": 5.312206840515136,
|
|
"epoch": 4.960815047021944,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0003077298118380301,
|
|
"loss": 4.8833,
|
|
"mean_token_accuracy": 0.197268608212471,
|
|
"num_tokens": 11126009.0,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"entropy": 5.275107955932617,
|
|
"epoch": 4.964733542319749,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0003074321913224765,
|
|
"loss": 4.7779,
|
|
"mean_token_accuracy": 0.20743790715932847,
|
|
"num_tokens": 11134404.0,
|
|
"step": 6335
|
|
},
|
|
{
|
|
"entropy": 5.313373374938965,
|
|
"epoch": 4.968652037617555,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003071345128453382,
|
|
"loss": 4.9146,
|
|
"mean_token_accuracy": 0.19585829079151154,
|
|
"num_tokens": 11142245.0,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"entropy": 5.250147294998169,
|
|
"epoch": 4.972570532915361,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003068367769386153,
|
|
"loss": 4.8945,
|
|
"mean_token_accuracy": 0.19975371211767196,
|
|
"num_tokens": 11150881.0,
|
|
"step": 6345
|
|
},
|
|
{
|
|
"entropy": 5.203444719314575,
|
|
"epoch": 4.976489028213166,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00030653898413440973,
|
|
"loss": 4.8321,
|
|
"mean_token_accuracy": 0.20126568675041198,
|
|
"num_tokens": 11159421.0,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"entropy": 5.319747447967529,
|
|
"epoch": 4.980407523510972,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0003062411349649258,
|
|
"loss": 4.8974,
|
|
"mean_token_accuracy": 0.19624017924070358,
|
|
"num_tokens": 11168633.0,
|
|
"step": 6355
|
|
},
|
|
{
|
|
"entropy": 5.263225889205932,
|
|
"epoch": 4.984326018808778,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00030594322996246816,
|
|
"loss": 4.7415,
|
|
"mean_token_accuracy": 0.2043210431933403,
|
|
"num_tokens": 11177435.0,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"entropy": 5.333282947540283,
|
|
"epoch": 4.988244514106583,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00030564526965944127,
|
|
"loss": 4.9665,
|
|
"mean_token_accuracy": 0.1936241254210472,
|
|
"num_tokens": 11186195.0,
|
|
"step": 6365
|
|
},
|
|
{
|
|
"entropy": 5.232795333862304,
|
|
"epoch": 4.992163009404389,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003053472545883488,
|
|
"loss": 4.817,
|
|
"mean_token_accuracy": 0.20305003225803375,
|
|
"num_tokens": 11194087.0,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"entropy": 5.247873401641845,
|
|
"epoch": 4.996081504702194,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00030504918528179164,
|
|
"loss": 4.8989,
|
|
"mean_token_accuracy": 0.19560456275939941,
|
|
"num_tokens": 11202677.0,
|
|
"step": 6375
|
|
},
|
|
{
|
|
"entropy": 5.338048839569092,
|
|
"epoch": 5.0,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00030475106227246824,
|
|
"loss": 4.9353,
|
|
"mean_token_accuracy": 0.1861883893609047,
|
|
"num_tokens": 11211560.0,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"entropy": 5.28595609664917,
|
|
"epoch": 5.003918495297806,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003044528860931726,
|
|
"loss": 4.646,
|
|
"mean_token_accuracy": 0.2191310092806816,
|
|
"num_tokens": 11220186.0,
|
|
"step": 6385
|
|
},
|
|
{
|
|
"entropy": 5.263968801498413,
|
|
"epoch": 5.007836990595611,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0003041546572767939,
|
|
"loss": 4.6643,
|
|
"mean_token_accuracy": 0.20860849022865297,
|
|
"num_tokens": 11229148.0,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"entropy": 5.269822978973389,
|
|
"epoch": 5.011755485893417,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0003038563763563156,
|
|
"loss": 4.6011,
|
|
"mean_token_accuracy": 0.22276840656995772,
|
|
"num_tokens": 11237585.0,
|
|
"step": 6395
|
|
},
|
|
{
|
|
"entropy": 5.269498443603515,
|
|
"epoch": 5.015673981191223,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00030355804386481374,
|
|
"loss": 4.5603,
|
|
"mean_token_accuracy": 0.2151197999715805,
|
|
"num_tokens": 11246203.0,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"entropy": 5.275403213500977,
|
|
"epoch": 5.019592476489028,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00030325966033545716,
|
|
"loss": 4.6792,
|
|
"mean_token_accuracy": 0.21304512470960618,
|
|
"num_tokens": 11254999.0,
|
|
"step": 6405
|
|
},
|
|
{
|
|
"entropy": 5.259046173095703,
|
|
"epoch": 5.023510971786834,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00030296122630150534,
|
|
"loss": 4.6159,
|
|
"mean_token_accuracy": 0.21581995934247972,
|
|
"num_tokens": 11264110.0,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"entropy": 5.270573377609253,
|
|
"epoch": 5.027429467084639,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00030266274229630844,
|
|
"loss": 4.5706,
|
|
"mean_token_accuracy": 0.22306174039840698,
|
|
"num_tokens": 11273452.0,
|
|
"step": 6415
|
|
},
|
|
{
|
|
"entropy": 5.172800779342651,
|
|
"epoch": 5.031347962382445,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00030236420885330586,
|
|
"loss": 4.5841,
|
|
"mean_token_accuracy": 0.2129107117652893,
|
|
"num_tokens": 11281926.0,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"entropy": 5.273963260650635,
|
|
"epoch": 5.035266457680251,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003020656265060251,
|
|
"loss": 4.6501,
|
|
"mean_token_accuracy": 0.22558311969041825,
|
|
"num_tokens": 11290630.0,
|
|
"step": 6425
|
|
},
|
|
{
|
|
"entropy": 5.278858375549317,
|
|
"epoch": 5.039184952978056,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003017669957880813,
|
|
"loss": 4.6469,
|
|
"mean_token_accuracy": 0.21591717451810838,
|
|
"num_tokens": 11299126.0,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"entropy": 5.257981634140014,
|
|
"epoch": 5.043103448275862,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000301468317233176,
|
|
"loss": 4.7469,
|
|
"mean_token_accuracy": 0.20141037106513976,
|
|
"num_tokens": 11308331.0,
|
|
"step": 6435
|
|
},
|
|
{
|
|
"entropy": 5.307056903839111,
|
|
"epoch": 5.047021943573668,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00030116959137509614,
|
|
"loss": 4.6431,
|
|
"mean_token_accuracy": 0.20703964978456496,
|
|
"num_tokens": 11316245.0,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"entropy": 5.317541265487671,
|
|
"epoch": 5.050940438871473,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00030087081874771325,
|
|
"loss": 4.6386,
|
|
"mean_token_accuracy": 0.21203394383192062,
|
|
"num_tokens": 11324994.0,
|
|
"step": 6445
|
|
},
|
|
{
|
|
"entropy": 5.186508893966675,
|
|
"epoch": 5.054858934169279,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00030057199988498245,
|
|
"loss": 4.5873,
|
|
"mean_token_accuracy": 0.21664729416370393,
|
|
"num_tokens": 11333255.0,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"entropy": 5.186954402923584,
|
|
"epoch": 5.058777429467085,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003002731353209416,
|
|
"loss": 4.6272,
|
|
"mean_token_accuracy": 0.21368326246738434,
|
|
"num_tokens": 11341857.0,
|
|
"step": 6455
|
|
},
|
|
{
|
|
"entropy": 5.247381734848022,
|
|
"epoch": 5.06269592476489,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0002999742255897102,
|
|
"loss": 4.7333,
|
|
"mean_token_accuracy": 0.2015768766403198,
|
|
"num_tokens": 11350933.0,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"entropy": 5.274549579620361,
|
|
"epoch": 5.066614420062696,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00029967527122548826,
|
|
"loss": 4.6175,
|
|
"mean_token_accuracy": 0.21129999905824662,
|
|
"num_tokens": 11360346.0,
|
|
"step": 6465
|
|
},
|
|
{
|
|
"entropy": 5.253450536727906,
|
|
"epoch": 5.070532915360501,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002993762727625556,
|
|
"loss": 4.6798,
|
|
"mean_token_accuracy": 0.21015864759683608,
|
|
"num_tokens": 11369401.0,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"entropy": 5.261023283004761,
|
|
"epoch": 5.074451410658307,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0002990772307352712,
|
|
"loss": 4.6326,
|
|
"mean_token_accuracy": 0.21183062791824342,
|
|
"num_tokens": 11377882.0,
|
|
"step": 6475
|
|
},
|
|
{
|
|
"entropy": 5.277976846694946,
|
|
"epoch": 5.078369905956113,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0002987781456780715,
|
|
"loss": 4.6373,
|
|
"mean_token_accuracy": 0.21320178359746933,
|
|
"num_tokens": 11386934.0,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"entropy": 5.261706113815308,
|
|
"epoch": 5.082288401253918,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00029847901812547,
|
|
"loss": 4.6835,
|
|
"mean_token_accuracy": 0.20635850578546525,
|
|
"num_tokens": 11395929.0,
|
|
"step": 6485
|
|
},
|
|
{
|
|
"entropy": 5.272850894927979,
|
|
"epoch": 5.086206896551724,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00029817984861205626,
|
|
"loss": 4.6065,
|
|
"mean_token_accuracy": 0.21154761910438538,
|
|
"num_tokens": 11405664.0,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"entropy": 5.244519472122192,
|
|
"epoch": 5.09012539184953,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0002978806376724945,
|
|
"loss": 4.7025,
|
|
"mean_token_accuracy": 0.2158430889248848,
|
|
"num_tokens": 11414667.0,
|
|
"step": 6495
|
|
},
|
|
{
|
|
"entropy": 5.260897636413574,
|
|
"epoch": 5.094043887147335,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00029758138584152333,
|
|
"loss": 4.6797,
|
|
"mean_token_accuracy": 0.20965487509965897,
|
|
"num_tokens": 11423907.0,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 5.094043887147335,
|
|
"eval_entropy": 5.166154653527016,
|
|
"eval_loss": 5.708652496337891,
|
|
"eval_mean_token_accuracy": 0.16755875503254491,
|
|
"eval_num_tokens": 11423907.0,
|
|
"eval_runtime": 2.8259,
|
|
"eval_samples_per_second": 1458.632,
|
|
"eval_steps_per_second": 182.594,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"entropy": 5.301111888885498,
|
|
"epoch": 5.097962382445141,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00029728209365395433,
|
|
"loss": 4.6923,
|
|
"mean_token_accuracy": 0.21454016417264937,
|
|
"num_tokens": 11433824.0,
|
|
"step": 6505
|
|
},
|
|
{
|
|
"entropy": 5.303480529785157,
|
|
"epoch": 5.101880877742946,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00029698276164467105,
|
|
"loss": 4.595,
|
|
"mean_token_accuracy": 0.21750776022672652,
|
|
"num_tokens": 11442371.0,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"entropy": 5.264154148101807,
|
|
"epoch": 5.105799373040752,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0002966833903486284,
|
|
"loss": 4.6364,
|
|
"mean_token_accuracy": 0.21112638413906099,
|
|
"num_tokens": 11451657.0,
|
|
"step": 6515
|
|
},
|
|
{
|
|
"entropy": 5.157643842697143,
|
|
"epoch": 5.109717868338558,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00029638398030085134,
|
|
"loss": 4.5782,
|
|
"mean_token_accuracy": 0.21440080553293228,
|
|
"num_tokens": 11460064.0,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"entropy": 5.22798490524292,
|
|
"epoch": 5.113636363636363,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00029608453203643434,
|
|
"loss": 4.6705,
|
|
"mean_token_accuracy": 0.21046082079410552,
|
|
"num_tokens": 11469184.0,
|
|
"step": 6525
|
|
},
|
|
{
|
|
"entropy": 5.284279680252075,
|
|
"epoch": 5.117554858934169,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0002957850460905398,
|
|
"loss": 4.6037,
|
|
"mean_token_accuracy": 0.22673124670982361,
|
|
"num_tokens": 11478069.0,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"entropy": 5.301330184936523,
|
|
"epoch": 5.121473354231975,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00029548552299839774,
|
|
"loss": 4.6446,
|
|
"mean_token_accuracy": 0.2147235542535782,
|
|
"num_tokens": 11486572.0,
|
|
"step": 6535
|
|
},
|
|
{
|
|
"entropy": 5.2706715106964115,
|
|
"epoch": 5.12539184952978,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0002951859632953046,
|
|
"loss": 4.6852,
|
|
"mean_token_accuracy": 0.2118826299905777,
|
|
"num_tokens": 11495857.0,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"entropy": 5.244790172576904,
|
|
"epoch": 5.129310344827586,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00029488636751662196,
|
|
"loss": 4.668,
|
|
"mean_token_accuracy": 0.21142137944698333,
|
|
"num_tokens": 11504875.0,
|
|
"step": 6545
|
|
},
|
|
{
|
|
"entropy": 5.261630916595459,
|
|
"epoch": 5.133228840125392,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002945867361977762,
|
|
"loss": 4.6649,
|
|
"mean_token_accuracy": 0.2080566480755806,
|
|
"num_tokens": 11514509.0,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"entropy": 5.278762865066528,
|
|
"epoch": 5.137147335423197,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00029428706987425705,
|
|
"loss": 4.7072,
|
|
"mean_token_accuracy": 0.20585841983556746,
|
|
"num_tokens": 11523046.0,
|
|
"step": 6555
|
|
},
|
|
{
|
|
"entropy": 5.226072978973389,
|
|
"epoch": 5.141065830721003,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0002939873690816167,
|
|
"loss": 4.5951,
|
|
"mean_token_accuracy": 0.2208260104060173,
|
|
"num_tokens": 11531270.0,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"entropy": 5.254046487808227,
|
|
"epoch": 5.144984326018808,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0002936876343554692,
|
|
"loss": 4.6573,
|
|
"mean_token_accuracy": 0.20754229873418809,
|
|
"num_tokens": 11539722.0,
|
|
"step": 6565
|
|
},
|
|
{
|
|
"entropy": 5.270880126953125,
|
|
"epoch": 5.148902821316614,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000293387866231489,
|
|
"loss": 4.6901,
|
|
"mean_token_accuracy": 0.21452855318784714,
|
|
"num_tokens": 11549614.0,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"entropy": 5.188871955871582,
|
|
"epoch": 5.15282131661442,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00029308806524541047,
|
|
"loss": 4.5418,
|
|
"mean_token_accuracy": 0.21589773446321486,
|
|
"num_tokens": 11557674.0,
|
|
"step": 6575
|
|
},
|
|
{
|
|
"entropy": 5.200792551040649,
|
|
"epoch": 5.156739811912225,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00029278823193302643,
|
|
"loss": 4.6024,
|
|
"mean_token_accuracy": 0.2138206571340561,
|
|
"num_tokens": 11566691.0,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"entropy": 5.266089534759521,
|
|
"epoch": 5.160658307210031,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00029248836683018774,
|
|
"loss": 4.605,
|
|
"mean_token_accuracy": 0.21657195538282395,
|
|
"num_tokens": 11574824.0,
|
|
"step": 6585
|
|
},
|
|
{
|
|
"entropy": 5.213444519042969,
|
|
"epoch": 5.164576802507837,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00029218847047280197,
|
|
"loss": 4.5956,
|
|
"mean_token_accuracy": 0.2115613877773285,
|
|
"num_tokens": 11582988.0,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"entropy": 5.24198842048645,
|
|
"epoch": 5.168495297805642,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0002918885433968323,
|
|
"loss": 4.6904,
|
|
"mean_token_accuracy": 0.21076444089412688,
|
|
"num_tokens": 11591553.0,
|
|
"step": 6595
|
|
},
|
|
{
|
|
"entropy": 5.224826955795288,
|
|
"epoch": 5.172413793103448,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0002915885861382974,
|
|
"loss": 4.6938,
|
|
"mean_token_accuracy": 0.205668543279171,
|
|
"num_tokens": 11600347.0,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"entropy": 5.248018455505371,
|
|
"epoch": 5.1763322884012535,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00029128859923326935,
|
|
"loss": 4.6955,
|
|
"mean_token_accuracy": 0.21317073702812195,
|
|
"num_tokens": 11609217.0,
|
|
"step": 6605
|
|
},
|
|
{
|
|
"entropy": 5.295130729675293,
|
|
"epoch": 5.1802507836990594,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00029098858321787336,
|
|
"loss": 4.6603,
|
|
"mean_token_accuracy": 0.21704728603363038,
|
|
"num_tokens": 11617355.0,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"entropy": 5.29042329788208,
|
|
"epoch": 5.184169278996865,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00029068853862828677,
|
|
"loss": 4.6539,
|
|
"mean_token_accuracy": 0.21141608208417892,
|
|
"num_tokens": 11626152.0,
|
|
"step": 6615
|
|
},
|
|
{
|
|
"entropy": 5.204212045669555,
|
|
"epoch": 5.1880877742946705,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0002903884660007378,
|
|
"loss": 4.6171,
|
|
"mean_token_accuracy": 0.21352380514144897,
|
|
"num_tokens": 11634868.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"entropy": 5.238558626174926,
|
|
"epoch": 5.1920062695924765,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00029008836587150505,
|
|
"loss": 4.7103,
|
|
"mean_token_accuracy": 0.20710874050855638,
|
|
"num_tokens": 11644199.0,
|
|
"step": 6625
|
|
},
|
|
{
|
|
"entropy": 5.326083040237426,
|
|
"epoch": 5.195924764890282,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002897882387769159,
|
|
"loss": 4.7529,
|
|
"mean_token_accuracy": 0.20760505348443986,
|
|
"num_tokens": 11653227.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"entropy": 5.186022710800171,
|
|
"epoch": 5.1998432601880875,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0002894880852533464,
|
|
"loss": 4.5599,
|
|
"mean_token_accuracy": 0.2236059531569481,
|
|
"num_tokens": 11661628.0,
|
|
"step": 6635
|
|
},
|
|
{
|
|
"entropy": 5.203079080581665,
|
|
"epoch": 5.2037617554858935,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00028918790583721936,
|
|
"loss": 4.6165,
|
|
"mean_token_accuracy": 0.21708841174840926,
|
|
"num_tokens": 11670616.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"entropy": 5.189866018295288,
|
|
"epoch": 5.2076802507836994,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0002888877010650041,
|
|
"loss": 4.6165,
|
|
"mean_token_accuracy": 0.22511828392744065,
|
|
"num_tokens": 11679069.0,
|
|
"step": 6645
|
|
},
|
|
{
|
|
"entropy": 5.287440967559815,
|
|
"epoch": 5.2115987460815045,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00028858747147321527,
|
|
"loss": 4.634,
|
|
"mean_token_accuracy": 0.21522687673568724,
|
|
"num_tokens": 11688184.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"entropy": 5.290709543228149,
|
|
"epoch": 5.2155172413793105,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0002882872175984118,
|
|
"loss": 4.6767,
|
|
"mean_token_accuracy": 0.20681101977825164,
|
|
"num_tokens": 11696389.0,
|
|
"step": 6655
|
|
},
|
|
{
|
|
"entropy": 5.247362279891968,
|
|
"epoch": 5.219435736677116,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0002879869399771962,
|
|
"loss": 4.6934,
|
|
"mean_token_accuracy": 0.2073291838169098,
|
|
"num_tokens": 11704881.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"entropy": 5.265257167816162,
|
|
"epoch": 5.2233542319749215,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00028768663914621295,
|
|
"loss": 4.7351,
|
|
"mean_token_accuracy": 0.20862899273633956,
|
|
"num_tokens": 11713566.0,
|
|
"step": 6665
|
|
},
|
|
{
|
|
"entropy": 5.263144159317017,
|
|
"epoch": 5.2272727272727275,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002873863156421486,
|
|
"loss": 4.6866,
|
|
"mean_token_accuracy": 0.2133264198899269,
|
|
"num_tokens": 11722427.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"entropy": 5.2976202964782715,
|
|
"epoch": 5.231191222570533,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00028708597000172984,
|
|
"loss": 4.6498,
|
|
"mean_token_accuracy": 0.21248123943805694,
|
|
"num_tokens": 11731010.0,
|
|
"step": 6675
|
|
},
|
|
{
|
|
"entropy": 5.217772912979126,
|
|
"epoch": 5.235109717868339,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0002867856027617229,
|
|
"loss": 4.694,
|
|
"mean_token_accuracy": 0.2125796914100647,
|
|
"num_tokens": 11739569.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"entropy": 5.223019933700561,
|
|
"epoch": 5.2390282131661445,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00028648521445893304,
|
|
"loss": 4.6244,
|
|
"mean_token_accuracy": 0.21678661555051804,
|
|
"num_tokens": 11747687.0,
|
|
"step": 6685
|
|
},
|
|
{
|
|
"entropy": 5.251226329803467,
|
|
"epoch": 5.24294670846395,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0002861848056302026,
|
|
"loss": 4.7135,
|
|
"mean_token_accuracy": 0.20850612968206406,
|
|
"num_tokens": 11756713.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"entropy": 5.238022899627685,
|
|
"epoch": 5.246865203761756,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00028588437681241106,
|
|
"loss": 4.6179,
|
|
"mean_token_accuracy": 0.21880824565887452,
|
|
"num_tokens": 11765625.0,
|
|
"step": 6695
|
|
},
|
|
{
|
|
"entropy": 5.276667547225952,
|
|
"epoch": 5.250783699059561,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00028558392854247333,
|
|
"loss": 4.7268,
|
|
"mean_token_accuracy": 0.20920634567737578,
|
|
"num_tokens": 11774594.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"entropy": 5.22579345703125,
|
|
"epoch": 5.254702194357367,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002852834613573391,
|
|
"loss": 4.6507,
|
|
"mean_token_accuracy": 0.21433300524950027,
|
|
"num_tokens": 11782983.0,
|
|
"step": 6705
|
|
},
|
|
{
|
|
"entropy": 5.228710508346557,
|
|
"epoch": 5.258620689655173,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0002849829757939922,
|
|
"loss": 4.6438,
|
|
"mean_token_accuracy": 0.2125426009297371,
|
|
"num_tokens": 11791940.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"entropy": 5.269276475906372,
|
|
"epoch": 5.262539184952978,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0002846824723894487,
|
|
"loss": 4.6522,
|
|
"mean_token_accuracy": 0.21146156042814254,
|
|
"num_tokens": 11800357.0,
|
|
"step": 6715
|
|
},
|
|
{
|
|
"entropy": 5.263609075546265,
|
|
"epoch": 5.266457680250784,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0002843819516807572,
|
|
"loss": 4.706,
|
|
"mean_token_accuracy": 0.20991114974021913,
|
|
"num_tokens": 11809005.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"entropy": 5.315962171554565,
|
|
"epoch": 5.27037617554859,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00028408141420499685,
|
|
"loss": 4.6645,
|
|
"mean_token_accuracy": 0.20767489373683928,
|
|
"num_tokens": 11818021.0,
|
|
"step": 6725
|
|
},
|
|
{
|
|
"entropy": 5.283413124084473,
|
|
"epoch": 5.274294670846395,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0002837808604992768,
|
|
"loss": 4.6731,
|
|
"mean_token_accuracy": 0.20781936049461364,
|
|
"num_tokens": 11826791.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"entropy": 5.203449249267578,
|
|
"epoch": 5.278213166144201,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00028348029110073533,
|
|
"loss": 4.5859,
|
|
"mean_token_accuracy": 0.2139137178659439,
|
|
"num_tokens": 11835110.0,
|
|
"step": 6735
|
|
},
|
|
{
|
|
"entropy": 5.202353525161743,
|
|
"epoch": 5.282131661442007,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0002831797065465386,
|
|
"loss": 4.7204,
|
|
"mean_token_accuracy": 0.21090250164270402,
|
|
"num_tokens": 11842658.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"entropy": 5.18989930152893,
|
|
"epoch": 5.286050156739812,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00028287910737388,
|
|
"loss": 4.6761,
|
|
"mean_token_accuracy": 0.21297577917575836,
|
|
"num_tokens": 11851028.0,
|
|
"step": 6745
|
|
},
|
|
{
|
|
"entropy": 5.291472387313843,
|
|
"epoch": 5.289968652037618,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000282578494119979,
|
|
"loss": 4.6134,
|
|
"mean_token_accuracy": 0.22368789464235306,
|
|
"num_tokens": 11859471.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"entropy": 5.226733255386352,
|
|
"epoch": 5.293887147335424,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00028227786732208014,
|
|
"loss": 4.5789,
|
|
"mean_token_accuracy": 0.21355614066123962,
|
|
"num_tokens": 11868625.0,
|
|
"step": 6755
|
|
},
|
|
{
|
|
"entropy": 5.228630685806275,
|
|
"epoch": 5.297805642633229,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00028197722751745246,
|
|
"loss": 4.6347,
|
|
"mean_token_accuracy": 0.21364559531211852,
|
|
"num_tokens": 11877744.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"entropy": 5.233176040649414,
|
|
"epoch": 5.301724137931035,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0002816765752433879,
|
|
"loss": 4.6083,
|
|
"mean_token_accuracy": 0.2157480463385582,
|
|
"num_tokens": 11886445.0,
|
|
"step": 6765
|
|
},
|
|
{
|
|
"entropy": 5.179307270050049,
|
|
"epoch": 5.30564263322884,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00028137591103720075,
|
|
"loss": 4.644,
|
|
"mean_token_accuracy": 0.21147758811712264,
|
|
"num_tokens": 11896040.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"entropy": 5.172809505462647,
|
|
"epoch": 5.309561128526646,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0002810752354362267,
|
|
"loss": 4.6582,
|
|
"mean_token_accuracy": 0.20637594014406205,
|
|
"num_tokens": 11904996.0,
|
|
"step": 6775
|
|
},
|
|
{
|
|
"entropy": 5.25744252204895,
|
|
"epoch": 5.313479623824452,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000280774548977822,
|
|
"loss": 4.6692,
|
|
"mean_token_accuracy": 0.2157742828130722,
|
|
"num_tokens": 11913357.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"entropy": 5.261717319488525,
|
|
"epoch": 5.317398119122257,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00028047385219936196,
|
|
"loss": 4.6958,
|
|
"mean_token_accuracy": 0.21106449216604234,
|
|
"num_tokens": 11922196.0,
|
|
"step": 6785
|
|
},
|
|
{
|
|
"entropy": 5.160858631134033,
|
|
"epoch": 5.321316614420063,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00028017314563824044,
|
|
"loss": 4.5596,
|
|
"mean_token_accuracy": 0.2191374808549881,
|
|
"num_tokens": 11930682.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"entropy": 5.150775814056397,
|
|
"epoch": 5.325235109717869,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00027987242983186896,
|
|
"loss": 4.663,
|
|
"mean_token_accuracy": 0.20993126332759857,
|
|
"num_tokens": 11939470.0,
|
|
"step": 6795
|
|
},
|
|
{
|
|
"entropy": 5.267625093460083,
|
|
"epoch": 5.329153605015674,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00027957170531767525,
|
|
"loss": 4.654,
|
|
"mean_token_accuracy": 0.2149309128522873,
|
|
"num_tokens": 11948183.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"entropy": 5.288802146911621,
|
|
"epoch": 5.33307210031348,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00027927097263310294,
|
|
"loss": 4.6884,
|
|
"mean_token_accuracy": 0.21001980900764466,
|
|
"num_tokens": 11957027.0,
|
|
"step": 6805
|
|
},
|
|
{
|
|
"entropy": 5.232284927368164,
|
|
"epoch": 5.336990595611285,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0002789702323156099,
|
|
"loss": 4.6415,
|
|
"mean_token_accuracy": 0.21445426791906358,
|
|
"num_tokens": 11965871.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"entropy": 5.283229780197144,
|
|
"epoch": 5.340909090909091,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00027866948490266815,
|
|
"loss": 4.7687,
|
|
"mean_token_accuracy": 0.19667116850614547,
|
|
"num_tokens": 11974384.0,
|
|
"step": 6815
|
|
},
|
|
{
|
|
"entropy": 5.284752607345581,
|
|
"epoch": 5.344827586206897,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0002783687309317618,
|
|
"loss": 4.6772,
|
|
"mean_token_accuracy": 0.2063450336456299,
|
|
"num_tokens": 11983892.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"entropy": 5.334119081497192,
|
|
"epoch": 5.348746081504702,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0002780679709403871,
|
|
"loss": 4.7186,
|
|
"mean_token_accuracy": 0.21132294982671737,
|
|
"num_tokens": 11992990.0,
|
|
"step": 6825
|
|
},
|
|
{
|
|
"entropy": 5.140944242477417,
|
|
"epoch": 5.352664576802508,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00027776720546605086,
|
|
"loss": 4.5226,
|
|
"mean_token_accuracy": 0.2295491874217987,
|
|
"num_tokens": 12001679.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 5.213904523849488,
|
|
"epoch": 5.356583072100314,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0002774664350462697,
|
|
"loss": 4.6985,
|
|
"mean_token_accuracy": 0.2092220142483711,
|
|
"num_tokens": 12011300.0,
|
|
"step": 6835
|
|
},
|
|
{
|
|
"entropy": 5.205938100814819,
|
|
"epoch": 5.360501567398119,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00027716566021856933,
|
|
"loss": 4.6487,
|
|
"mean_token_accuracy": 0.21584032773971557,
|
|
"num_tokens": 12019776.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 5.205404472351074,
|
|
"epoch": 5.364420062695925,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000276864881520483,
|
|
"loss": 4.6738,
|
|
"mean_token_accuracy": 0.218022920191288,
|
|
"num_tokens": 12028838.0,
|
|
"step": 6845
|
|
},
|
|
{
|
|
"entropy": 5.284903717041016,
|
|
"epoch": 5.368338557993731,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0002765640994895509,
|
|
"loss": 4.6315,
|
|
"mean_token_accuracy": 0.2172762408852577,
|
|
"num_tokens": 12036903.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 5.243964529037475,
|
|
"epoch": 5.372257053291536,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00027626331466331955,
|
|
"loss": 4.6302,
|
|
"mean_token_accuracy": 0.20949662774801253,
|
|
"num_tokens": 12045838.0,
|
|
"step": 6855
|
|
},
|
|
{
|
|
"entropy": 5.301427173614502,
|
|
"epoch": 5.376175548589342,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00027596252757933995,
|
|
"loss": 4.7863,
|
|
"mean_token_accuracy": 0.200015589594841,
|
|
"num_tokens": 12055791.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 5.211847162246704,
|
|
"epoch": 5.380094043887147,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0002756617387751674,
|
|
"loss": 4.5837,
|
|
"mean_token_accuracy": 0.21535126566886903,
|
|
"num_tokens": 12064842.0,
|
|
"step": 6865
|
|
},
|
|
{
|
|
"entropy": 5.216354656219482,
|
|
"epoch": 5.384012539184953,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0002753609487883606,
|
|
"loss": 4.662,
|
|
"mean_token_accuracy": 0.21460348516702651,
|
|
"num_tokens": 12073487.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 5.174469423294068,
|
|
"epoch": 5.387931034482759,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00027506015815647965,
|
|
"loss": 4.6183,
|
|
"mean_token_accuracy": 0.20809555053710938,
|
|
"num_tokens": 12082000.0,
|
|
"step": 6875
|
|
},
|
|
{
|
|
"entropy": 5.231398487091065,
|
|
"epoch": 5.391849529780564,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00027475936741708636,
|
|
"loss": 4.6713,
|
|
"mean_token_accuracy": 0.20541920363903046,
|
|
"num_tokens": 12090347.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 5.233875417709351,
|
|
"epoch": 5.39576802507837,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0002744585771077425,
|
|
"loss": 4.6349,
|
|
"mean_token_accuracy": 0.2128530889749527,
|
|
"num_tokens": 12098340.0,
|
|
"step": 6885
|
|
},
|
|
{
|
|
"entropy": 5.213066530227661,
|
|
"epoch": 5.399686520376176,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00027415778776600913,
|
|
"loss": 4.6694,
|
|
"mean_token_accuracy": 0.2185819551348686,
|
|
"num_tokens": 12106771.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 5.299434757232666,
|
|
"epoch": 5.403605015673981,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0002738569999294456,
|
|
"loss": 4.6748,
|
|
"mean_token_accuracy": 0.20964342802762986,
|
|
"num_tokens": 12115436.0,
|
|
"step": 6895
|
|
},
|
|
{
|
|
"entropy": 5.209298515319825,
|
|
"epoch": 5.407523510971787,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0002735562141356085,
|
|
"loss": 4.5713,
|
|
"mean_token_accuracy": 0.22048606872558593,
|
|
"num_tokens": 12123939.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 5.334646844863892,
|
|
"epoch": 5.411442006269592,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0002732554309220509,
|
|
"loss": 4.7954,
|
|
"mean_token_accuracy": 0.2062287822365761,
|
|
"num_tokens": 12132803.0,
|
|
"step": 6905
|
|
},
|
|
{
|
|
"entropy": 5.220536470413208,
|
|
"epoch": 5.415360501567398,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0002729546508263211,
|
|
"loss": 4.7162,
|
|
"mean_token_accuracy": 0.20899704247713088,
|
|
"num_tokens": 12141875.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 5.26422004699707,
|
|
"epoch": 5.419278996865204,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0002726538743859618,
|
|
"loss": 4.639,
|
|
"mean_token_accuracy": 0.22007594704627992,
|
|
"num_tokens": 12150685.0,
|
|
"step": 6915
|
|
},
|
|
{
|
|
"entropy": 5.246354103088379,
|
|
"epoch": 5.423197492163009,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0002723531021385095,
|
|
"loss": 4.6368,
|
|
"mean_token_accuracy": 0.21628635078668595,
|
|
"num_tokens": 12159070.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 5.197935676574707,
|
|
"epoch": 5.427115987460815,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0002720523346214928,
|
|
"loss": 4.6579,
|
|
"mean_token_accuracy": 0.21571388691663743,
|
|
"num_tokens": 12167676.0,
|
|
"step": 6925
|
|
},
|
|
{
|
|
"entropy": 5.185682249069214,
|
|
"epoch": 5.431034482758621,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00027175157237243204,
|
|
"loss": 4.6407,
|
|
"mean_token_accuracy": 0.21227844953536987,
|
|
"num_tokens": 12176222.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 5.276366472244263,
|
|
"epoch": 5.434952978056426,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0002714508159288382,
|
|
"loss": 4.663,
|
|
"mean_token_accuracy": 0.21908699870109558,
|
|
"num_tokens": 12185171.0,
|
|
"step": 6935
|
|
},
|
|
{
|
|
"entropy": 5.2900909900665285,
|
|
"epoch": 5.438871473354232,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0002711500658282118,
|
|
"loss": 4.7085,
|
|
"mean_token_accuracy": 0.21383791118860246,
|
|
"num_tokens": 12193875.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 5.160379123687744,
|
|
"epoch": 5.442789968652038,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00027084932260804193,
|
|
"loss": 4.648,
|
|
"mean_token_accuracy": 0.21514481604099273,
|
|
"num_tokens": 12202681.0,
|
|
"step": 6945
|
|
},
|
|
{
|
|
"entropy": 5.220508098602295,
|
|
"epoch": 5.446708463949843,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002705485868058056,
|
|
"loss": 4.6447,
|
|
"mean_token_accuracy": 0.2140985444188118,
|
|
"num_tokens": 12210854.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 5.3195148468017575,
|
|
"epoch": 5.450626959247649,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00027024785895896644,
|
|
"loss": 4.6626,
|
|
"mean_token_accuracy": 0.2115140065550804,
|
|
"num_tokens": 12219643.0,
|
|
"step": 6955
|
|
},
|
|
{
|
|
"entropy": 5.263359928131104,
|
|
"epoch": 5.454545454545454,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00026994713960497383,
|
|
"loss": 4.6847,
|
|
"mean_token_accuracy": 0.2135186165571213,
|
|
"num_tokens": 12228767.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 5.248637437820435,
|
|
"epoch": 5.45846394984326,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00026964642928126197,
|
|
"loss": 4.7379,
|
|
"mean_token_accuracy": 0.2048916608095169,
|
|
"num_tokens": 12237468.0,
|
|
"step": 6965
|
|
},
|
|
{
|
|
"entropy": 5.207278108596801,
|
|
"epoch": 5.462382445141066,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00026934572852524907,
|
|
"loss": 4.7079,
|
|
"mean_token_accuracy": 0.20973457843065263,
|
|
"num_tokens": 12246040.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 5.279171276092529,
|
|
"epoch": 5.466300940438871,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00026904503787433614,
|
|
"loss": 4.7913,
|
|
"mean_token_accuracy": 0.2033950075507164,
|
|
"num_tokens": 12254646.0,
|
|
"step": 6975
|
|
},
|
|
{
|
|
"entropy": 5.253938817977906,
|
|
"epoch": 5.470219435736677,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000268744357865906,
|
|
"loss": 4.6697,
|
|
"mean_token_accuracy": 0.20897333174943925,
|
|
"num_tokens": 12263306.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 5.232559490203857,
|
|
"epoch": 5.474137931034483,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00026844368903732263,
|
|
"loss": 4.66,
|
|
"mean_token_accuracy": 0.21718827337026597,
|
|
"num_tokens": 12272498.0,
|
|
"step": 6985
|
|
},
|
|
{
|
|
"entropy": 5.156026697158813,
|
|
"epoch": 5.478056426332288,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00026814303192593015,
|
|
"loss": 4.5489,
|
|
"mean_token_accuracy": 0.2185846298933029,
|
|
"num_tokens": 12281592.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"entropy": 5.247229671478271,
|
|
"epoch": 5.481974921630094,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00026784238706905136,
|
|
"loss": 4.6804,
|
|
"mean_token_accuracy": 0.21280920803546904,
|
|
"num_tokens": 12290112.0,
|
|
"step": 6995
|
|
},
|
|
{
|
|
"entropy": 5.280190658569336,
|
|
"epoch": 5.485893416927899,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0002675417550039874,
|
|
"loss": 4.7265,
|
|
"mean_token_accuracy": 0.2046452909708023,
|
|
"num_tokens": 12299868.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 5.485893416927899,
|
|
"eval_entropy": 5.1715009138565655,
|
|
"eval_loss": 5.689372539520264,
|
|
"eval_mean_token_accuracy": 0.16866252587227396,
|
|
"eval_num_tokens": 12299868.0,
|
|
"eval_runtime": 2.8305,
|
|
"eval_samples_per_second": 1456.278,
|
|
"eval_steps_per_second": 182.3,
|
|
"step": 7000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 12750,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.6561008458496e+16,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|