Model: fpadovani/ind-latn-100mb-after-ppt-shuff-dyck-10mb-ckpt500_seed3407 Source: Original Platform
2057 lines
54 KiB
JSON
2057 lines
54 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.7836990595611285,
|
|
"eval_steps": 500,
|
|
"global_step": 1000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 4.789229106903076,
|
|
"epoch": 0.003918495297805642,
|
|
"grad_norm": 17.125,
|
|
"learning_rate": 2e-06,
|
|
"loss": 14.3537,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 9174.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 4.8115012645721436,
|
|
"epoch": 0.007836990595611285,
|
|
"grad_norm": 19.625,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 14.2452,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 17790.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 4.899150800704956,
|
|
"epoch": 0.011755485893416929,
|
|
"grad_norm": 24.25,
|
|
"learning_rate": 7e-06,
|
|
"loss": 13.9044,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 25850.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 5.367580604553223,
|
|
"epoch": 0.01567398119122257,
|
|
"grad_norm": 32.5,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 13.1444,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 35194.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 8.583788537979126,
|
|
"epoch": 0.019592476489028215,
|
|
"grad_norm": 7.5,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 11.3911,
|
|
"mean_token_accuracy": 0.00023256096756085753,
|
|
"num_tokens": 44218.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 10.630141735076904,
|
|
"epoch": 0.023510971786833857,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 10.7102,
|
|
"mean_token_accuracy": 0.014597209030762314,
|
|
"num_tokens": 53397.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.695956897735595,
|
|
"epoch": 0.0274294670846395,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 10.4664,
|
|
"mean_token_accuracy": 0.01781447734683752,
|
|
"num_tokens": 62749.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.673796558380127,
|
|
"epoch": 0.03134796238244514,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 10.1632,
|
|
"mean_token_accuracy": 0.0182854525744915,
|
|
"num_tokens": 71721.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.631663513183593,
|
|
"epoch": 0.03526645768025078,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 9.8792,
|
|
"mean_token_accuracy": 0.03653257880359888,
|
|
"num_tokens": 79844.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.549837970733643,
|
|
"epoch": 0.03918495297805643,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.7665,
|
|
"mean_token_accuracy": 0.04605128690600395,
|
|
"num_tokens": 88866.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.509830379486084,
|
|
"epoch": 0.04310344827586207,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.6605,
|
|
"mean_token_accuracy": 0.044031094387173654,
|
|
"num_tokens": 97918.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.524475193023681,
|
|
"epoch": 0.047021943573667714,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.5619,
|
|
"mean_token_accuracy": 0.04601282589137554,
|
|
"num_tokens": 107043.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.512676334381103,
|
|
"epoch": 0.050940438871473356,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.4987,
|
|
"mean_token_accuracy": 0.04643600396811962,
|
|
"num_tokens": 116000.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.477371215820312,
|
|
"epoch": 0.054858934169279,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.4179,
|
|
"mean_token_accuracy": 0.04148977212607861,
|
|
"num_tokens": 124559.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.470399188995362,
|
|
"epoch": 0.05877742946708464,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.2945,
|
|
"mean_token_accuracy": 0.04952896051108837,
|
|
"num_tokens": 132868.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.453096103668212,
|
|
"epoch": 0.06269592476489028,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.2848,
|
|
"mean_token_accuracy": 0.05274602882564068,
|
|
"num_tokens": 141286.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.431593227386475,
|
|
"epoch": 0.06661442006269593,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 9.1405,
|
|
"mean_token_accuracy": 0.05872356928884983,
|
|
"num_tokens": 150406.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.36865291595459,
|
|
"epoch": 0.07053291536050156,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 9.0678,
|
|
"mean_token_accuracy": 0.059385529905557635,
|
|
"num_tokens": 158770.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.264866065979003,
|
|
"epoch": 0.07445141065830721,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 8.9633,
|
|
"mean_token_accuracy": 0.06288341507315635,
|
|
"num_tokens": 167763.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.183263969421386,
|
|
"epoch": 0.07836990595611286,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.819,
|
|
"mean_token_accuracy": 0.0607046652585268,
|
|
"num_tokens": 177306.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 10.144334697723389,
|
|
"epoch": 0.0822884012539185,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.7349,
|
|
"mean_token_accuracy": 0.06028640605509281,
|
|
"num_tokens": 186014.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 10.06214361190796,
|
|
"epoch": 0.08620689655172414,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.5758,
|
|
"mean_token_accuracy": 0.06410923898220063,
|
|
"num_tokens": 194122.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 9.952830028533935,
|
|
"epoch": 0.09012539184952978,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.4698,
|
|
"mean_token_accuracy": 0.05936008468270302,
|
|
"num_tokens": 203097.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 9.820581531524658,
|
|
"epoch": 0.09404388714733543,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.3405,
|
|
"mean_token_accuracy": 0.06221077479422092,
|
|
"num_tokens": 211413.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 9.732498264312744,
|
|
"epoch": 0.09796238244514106,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.2718,
|
|
"mean_token_accuracy": 0.061625415459275246,
|
|
"num_tokens": 220550.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 9.504752349853515,
|
|
"epoch": 0.10188087774294671,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 8.0995,
|
|
"mean_token_accuracy": 0.0649514563381672,
|
|
"num_tokens": 229197.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 9.307702922821045,
|
|
"epoch": 0.10579937304075235,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.0979,
|
|
"mean_token_accuracy": 0.05685936994850636,
|
|
"num_tokens": 238479.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 9.162922954559326,
|
|
"epoch": 0.109717868338558,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 7.9442,
|
|
"mean_token_accuracy": 0.059861503541469574,
|
|
"num_tokens": 246318.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 8.96123743057251,
|
|
"epoch": 0.11363636363636363,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 7.9513,
|
|
"mean_token_accuracy": 0.05959276556968689,
|
|
"num_tokens": 254783.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 8.760778617858886,
|
|
"epoch": 0.11755485893416928,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 7.7945,
|
|
"mean_token_accuracy": 0.06369670145213605,
|
|
"num_tokens": 263416.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 8.68117027282715,
|
|
"epoch": 0.12147335423197492,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 7.8147,
|
|
"mean_token_accuracy": 0.0631796333938837,
|
|
"num_tokens": 271930.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 8.476777839660645,
|
|
"epoch": 0.12539184952978055,
|
|
"grad_norm": 0.78515625,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.7159,
|
|
"mean_token_accuracy": 0.06549291461706161,
|
|
"num_tokens": 280546.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 8.308262157440186,
|
|
"epoch": 0.12931034482758622,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.7078,
|
|
"mean_token_accuracy": 0.06602046675980092,
|
|
"num_tokens": 288813.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.279962158203125,
|
|
"epoch": 0.13322884012539185,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.6847,
|
|
"mean_token_accuracy": 0.06443305909633637,
|
|
"num_tokens": 297966.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 8.152728843688966,
|
|
"epoch": 0.1371473354231975,
|
|
"grad_norm": 0.8671875,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.7467,
|
|
"mean_token_accuracy": 0.06189337000250816,
|
|
"num_tokens": 307135.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 8.145699501037598,
|
|
"epoch": 0.14106583072100312,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.6581,
|
|
"mean_token_accuracy": 0.06488074697554111,
|
|
"num_tokens": 315546.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 8.149786376953125,
|
|
"epoch": 0.14498432601880878,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.6538,
|
|
"mean_token_accuracy": 0.06405953019857406,
|
|
"num_tokens": 323930.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 7.983444690704346,
|
|
"epoch": 0.14890282131661442,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.5166,
|
|
"mean_token_accuracy": 0.07129846066236496,
|
|
"num_tokens": 332419.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 7.974339866638184,
|
|
"epoch": 0.15282131661442006,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.6157,
|
|
"mean_token_accuracy": 0.06940566822886467,
|
|
"num_tokens": 341362.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 7.973450374603272,
|
|
"epoch": 0.15673981191222572,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.5198,
|
|
"mean_token_accuracy": 0.07214542552828788,
|
|
"num_tokens": 349395.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 7.946638202667236,
|
|
"epoch": 0.16065830721003135,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.545,
|
|
"mean_token_accuracy": 0.06696730926632881,
|
|
"num_tokens": 358413.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 7.863241577148438,
|
|
"epoch": 0.164576802507837,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.6316,
|
|
"mean_token_accuracy": 0.06982938721776008,
|
|
"num_tokens": 366489.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 7.940403842926026,
|
|
"epoch": 0.16849529780564262,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.5023,
|
|
"mean_token_accuracy": 0.06740010716021061,
|
|
"num_tokens": 375335.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 7.874079847335816,
|
|
"epoch": 0.1724137931034483,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.5555,
|
|
"mean_token_accuracy": 0.07188675999641418,
|
|
"num_tokens": 384276.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 7.911562585830689,
|
|
"epoch": 0.17633228840125392,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.5929,
|
|
"mean_token_accuracy": 0.06714313849806786,
|
|
"num_tokens": 393571.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 7.920306205749512,
|
|
"epoch": 0.18025078369905956,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.5193,
|
|
"mean_token_accuracy": 0.07089398205280303,
|
|
"num_tokens": 401865.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 7.848536252975464,
|
|
"epoch": 0.1841692789968652,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.4905,
|
|
"mean_token_accuracy": 0.07226377129554748,
|
|
"num_tokens": 410518.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.869985485076905,
|
|
"epoch": 0.18808777429467086,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.4997,
|
|
"mean_token_accuracy": 0.07303371652960777,
|
|
"num_tokens": 419769.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.837644481658936,
|
|
"epoch": 0.1920062695924765,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.437,
|
|
"mean_token_accuracy": 0.0742616519331932,
|
|
"num_tokens": 428204.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.897941255569458,
|
|
"epoch": 0.19592476489028213,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.5267,
|
|
"mean_token_accuracy": 0.06978406608104706,
|
|
"num_tokens": 436594.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.80897855758667,
|
|
"epoch": 0.19984326018808776,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.3256,
|
|
"mean_token_accuracy": 0.07486266531050205,
|
|
"num_tokens": 444645.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.8133704662323,
|
|
"epoch": 0.20376175548589343,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.3726,
|
|
"mean_token_accuracy": 0.07738698273897171,
|
|
"num_tokens": 453016.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.736161422729492,
|
|
"epoch": 0.20768025078369906,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.4507,
|
|
"mean_token_accuracy": 0.06978621035814285,
|
|
"num_tokens": 462116.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.664476203918457,
|
|
"epoch": 0.2115987460815047,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.3961,
|
|
"mean_token_accuracy": 0.07115238644182682,
|
|
"num_tokens": 470807.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.7677568912506105,
|
|
"epoch": 0.21551724137931033,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.4818,
|
|
"mean_token_accuracy": 0.07023664973676205,
|
|
"num_tokens": 479592.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.912611389160157,
|
|
"epoch": 0.219435736677116,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.4068,
|
|
"mean_token_accuracy": 0.07160313390195369,
|
|
"num_tokens": 488107.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.755217599868774,
|
|
"epoch": 0.22335423197492163,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.4212,
|
|
"mean_token_accuracy": 0.07538670524954796,
|
|
"num_tokens": 496775.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.762103033065796,
|
|
"epoch": 0.22727272727272727,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.4447,
|
|
"mean_token_accuracy": 0.07036296911537647,
|
|
"num_tokens": 505415.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.757038116455078,
|
|
"epoch": 0.23119122257053293,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.4344,
|
|
"mean_token_accuracy": 0.074312524497509,
|
|
"num_tokens": 514447.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.7855620861053465,
|
|
"epoch": 0.23510971786833856,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.4189,
|
|
"mean_token_accuracy": 0.07684484757483005,
|
|
"num_tokens": 522998.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.774819326400757,
|
|
"epoch": 0.2390282131661442,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.3794,
|
|
"mean_token_accuracy": 0.07512850686907768,
|
|
"num_tokens": 531542.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.7300177097320555,
|
|
"epoch": 0.24294670846394983,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.3247,
|
|
"mean_token_accuracy": 0.07261879369616508,
|
|
"num_tokens": 540143.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.773348236083985,
|
|
"epoch": 0.2468652037617555,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.4017,
|
|
"mean_token_accuracy": 0.07950926274061203,
|
|
"num_tokens": 549156.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.7285737037658695,
|
|
"epoch": 0.2507836990595611,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.2557,
|
|
"mean_token_accuracy": 0.07481630519032478,
|
|
"num_tokens": 557522.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.654256391525268,
|
|
"epoch": 0.2547021943573668,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.331,
|
|
"mean_token_accuracy": 0.07940150126814842,
|
|
"num_tokens": 566650.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.672131299972534,
|
|
"epoch": 0.25862068965517243,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.2864,
|
|
"mean_token_accuracy": 0.07869702018797398,
|
|
"num_tokens": 576219.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.713160848617553,
|
|
"epoch": 0.26253918495297807,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.374,
|
|
"mean_token_accuracy": 0.07567794360220433,
|
|
"num_tokens": 584304.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.634349060058594,
|
|
"epoch": 0.2664576802507837,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.3003,
|
|
"mean_token_accuracy": 0.07832697704434395,
|
|
"num_tokens": 593163.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.6303457736969,
|
|
"epoch": 0.27037617554858934,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 7.2164,
|
|
"mean_token_accuracy": 0.07754571028053761,
|
|
"num_tokens": 602077.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.6355628490448,
|
|
"epoch": 0.274294670846395,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 7.3284,
|
|
"mean_token_accuracy": 0.08122679404914379,
|
|
"num_tokens": 610009.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.685596513748169,
|
|
"epoch": 0.2782131661442006,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000177,
|
|
"loss": 7.362,
|
|
"mean_token_accuracy": 0.07597106769680977,
|
|
"num_tokens": 619282.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.619720935821533,
|
|
"epoch": 0.28213166144200624,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 7.2893,
|
|
"mean_token_accuracy": 0.08018167689442635,
|
|
"num_tokens": 628138.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.6796112060546875,
|
|
"epoch": 0.28605015673981193,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000182,
|
|
"loss": 7.1745,
|
|
"mean_token_accuracy": 0.08669476807117463,
|
|
"num_tokens": 637021.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.619413709640503,
|
|
"epoch": 0.28996865203761757,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 7.3161,
|
|
"mean_token_accuracy": 0.07477690353989601,
|
|
"num_tokens": 646703.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.623689222335815,
|
|
"epoch": 0.2938871473354232,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.1981,
|
|
"mean_token_accuracy": 0.08060777708888053,
|
|
"num_tokens": 655616.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.556978368759156,
|
|
"epoch": 0.29780564263322884,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 7.1994,
|
|
"mean_token_accuracy": 0.08719867020845413,
|
|
"num_tokens": 663783.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.5845866203308105,
|
|
"epoch": 0.3017241379310345,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000192,
|
|
"loss": 7.2094,
|
|
"mean_token_accuracy": 0.08289245739579201,
|
|
"num_tokens": 671855.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.527840566635132,
|
|
"epoch": 0.3056426332288401,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 7.2219,
|
|
"mean_token_accuracy": 0.07747755497694016,
|
|
"num_tokens": 680981.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.6136561870574955,
|
|
"epoch": 0.30956112852664575,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 7.1491,
|
|
"mean_token_accuracy": 0.08336339518427849,
|
|
"num_tokens": 689294.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.469813442230224,
|
|
"epoch": 0.31347962382445144,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 7.1035,
|
|
"mean_token_accuracy": 0.08146922513842583,
|
|
"num_tokens": 697703.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.550826740264893,
|
|
"epoch": 0.31739811912225707,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000202,
|
|
"loss": 7.2372,
|
|
"mean_token_accuracy": 0.08058681413531303,
|
|
"num_tokens": 706792.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.606830406188965,
|
|
"epoch": 0.3213166144200627,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 7.1473,
|
|
"mean_token_accuracy": 0.08346155509352685,
|
|
"num_tokens": 715864.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.3859583854675295,
|
|
"epoch": 0.32523510971786834,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000207,
|
|
"loss": 7.1975,
|
|
"mean_token_accuracy": 0.0853593334555626,
|
|
"num_tokens": 723921.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.5406107902526855,
|
|
"epoch": 0.329153605015674,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 7.2071,
|
|
"mean_token_accuracy": 0.08046000376343727,
|
|
"num_tokens": 732797.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.510403490066528,
|
|
"epoch": 0.3330721003134796,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000212,
|
|
"loss": 7.0654,
|
|
"mean_token_accuracy": 0.0873202033340931,
|
|
"num_tokens": 741248.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.501159954071045,
|
|
"epoch": 0.33699059561128525,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 7.1615,
|
|
"mean_token_accuracy": 0.08190247714519501,
|
|
"num_tokens": 749766.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.408373832702637,
|
|
"epoch": 0.3409090909090909,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 7.1268,
|
|
"mean_token_accuracy": 0.08113668784499169,
|
|
"num_tokens": 758695.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.44956521987915,
|
|
"epoch": 0.3448275862068966,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 7.1248,
|
|
"mean_token_accuracy": 0.08192591443657875,
|
|
"num_tokens": 767624.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.422909212112427,
|
|
"epoch": 0.3487460815047022,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000222,
|
|
"loss": 7.117,
|
|
"mean_token_accuracy": 0.0853099413216114,
|
|
"num_tokens": 776616.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.365292644500732,
|
|
"epoch": 0.35266457680250785,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 7.1317,
|
|
"mean_token_accuracy": 0.08413158729672432,
|
|
"num_tokens": 786147.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.536469745635986,
|
|
"epoch": 0.3565830721003135,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 7.2317,
|
|
"mean_token_accuracy": 0.08228531405329705,
|
|
"num_tokens": 795213.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.461417722702026,
|
|
"epoch": 0.3605015673981191,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 7.0349,
|
|
"mean_token_accuracy": 0.09094136133790016,
|
|
"num_tokens": 803118.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.444038438796997,
|
|
"epoch": 0.36442006269592475,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 7.0219,
|
|
"mean_token_accuracy": 0.09442275986075402,
|
|
"num_tokens": 811358.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.324700260162354,
|
|
"epoch": 0.3683385579937304,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 7.0256,
|
|
"mean_token_accuracy": 0.08778790757060051,
|
|
"num_tokens": 819653.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.2960240840911865,
|
|
"epoch": 0.3722570532915361,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000237,
|
|
"loss": 7.0511,
|
|
"mean_token_accuracy": 0.08624262139201164,
|
|
"num_tokens": 828462.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.437795686721802,
|
|
"epoch": 0.3761755485893417,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 7.1429,
|
|
"mean_token_accuracy": 0.0912679947912693,
|
|
"num_tokens": 836204.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.2959794998168945,
|
|
"epoch": 0.38009404388714735,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000242,
|
|
"loss": 7.0169,
|
|
"mean_token_accuracy": 0.09246607050299645,
|
|
"num_tokens": 845032.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.4119359970092775,
|
|
"epoch": 0.384012539184953,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 7.0308,
|
|
"mean_token_accuracy": 0.08805579245090485,
|
|
"num_tokens": 853324.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.404975366592407,
|
|
"epoch": 0.3879310344827586,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.992,
|
|
"mean_token_accuracy": 0.1035026639699936,
|
|
"num_tokens": 861640.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.385119247436523,
|
|
"epoch": 0.39184952978056425,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 7.1744,
|
|
"mean_token_accuracy": 0.082430200278759,
|
|
"num_tokens": 870758.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.39184952978056425,
|
|
"eval_entropy": 7.167502074278603,
|
|
"eval_loss": 7.156619548797607,
|
|
"eval_mean_token_accuracy": 0.08891707594152684,
|
|
"eval_num_tokens": 870758.0,
|
|
"eval_runtime": 2.8546,
|
|
"eval_samples_per_second": 1444.004,
|
|
"eval_steps_per_second": 180.763,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 7.3540655136108395,
|
|
"epoch": 0.3957680250783699,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000252,
|
|
"loss": 7.0906,
|
|
"mean_token_accuracy": 0.0840302512049675,
|
|
"num_tokens": 879968.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 7.318256664276123,
|
|
"epoch": 0.3996865203761755,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 7.0471,
|
|
"mean_token_accuracy": 0.08631112575531005,
|
|
"num_tokens": 888804.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.259663057327271,
|
|
"epoch": 0.4036050156739812,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.000257,
|
|
"loss": 7.0624,
|
|
"mean_token_accuracy": 0.08012468516826629,
|
|
"num_tokens": 898593.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 7.2833295345306395,
|
|
"epoch": 0.40752351097178685,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 7.0799,
|
|
"mean_token_accuracy": 0.08342506065964699,
|
|
"num_tokens": 906873.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.3285657405853275,
|
|
"epoch": 0.4114420062695925,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000262,
|
|
"loss": 7.0359,
|
|
"mean_token_accuracy": 0.0934828281402588,
|
|
"num_tokens": 915080.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.422465038299561,
|
|
"epoch": 0.4153605015673981,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 7.0396,
|
|
"mean_token_accuracy": 0.08701496720314025,
|
|
"num_tokens": 923741.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 7.216967153549194,
|
|
"epoch": 0.41927899686520376,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 6.9459,
|
|
"mean_token_accuracy": 0.0904500350356102,
|
|
"num_tokens": 932455.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.235203742980957,
|
|
"epoch": 0.4231974921630094,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 6.9492,
|
|
"mean_token_accuracy": 0.09165697544813156,
|
|
"num_tokens": 941064.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 7.23210301399231,
|
|
"epoch": 0.427115987460815,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 6.9795,
|
|
"mean_token_accuracy": 0.0916426420211792,
|
|
"num_tokens": 950261.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 7.305574369430542,
|
|
"epoch": 0.43103448275862066,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 7.0461,
|
|
"mean_token_accuracy": 0.0905070275068283,
|
|
"num_tokens": 959151.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 7.299721527099609,
|
|
"epoch": 0.43495297805642635,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000277,
|
|
"loss": 7.0677,
|
|
"mean_token_accuracy": 0.09062978066504002,
|
|
"num_tokens": 968441.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 7.134230327606201,
|
|
"epoch": 0.438871473354232,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 6.9371,
|
|
"mean_token_accuracy": 0.09018276557326317,
|
|
"num_tokens": 977058.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 7.334470558166504,
|
|
"epoch": 0.4427899686520376,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 6.9519,
|
|
"mean_token_accuracy": 0.08950636759400368,
|
|
"num_tokens": 986531.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 7.123916816711426,
|
|
"epoch": 0.44670846394984326,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 6.8108,
|
|
"mean_token_accuracy": 0.09824811816215515,
|
|
"num_tokens": 995081.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 7.103513240814209,
|
|
"epoch": 0.4506269592476489,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000287,
|
|
"loss": 6.8802,
|
|
"mean_token_accuracy": 0.09345417022705078,
|
|
"num_tokens": 1003459.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 7.101778936386109,
|
|
"epoch": 0.45454545454545453,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 6.9239,
|
|
"mean_token_accuracy": 0.09018066227436065,
|
|
"num_tokens": 1012420.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 7.261321687698365,
|
|
"epoch": 0.45846394984326017,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000292,
|
|
"loss": 6.9954,
|
|
"mean_token_accuracy": 0.09160361662507058,
|
|
"num_tokens": 1021198.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 7.202180290222168,
|
|
"epoch": 0.46238244514106586,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 6.9694,
|
|
"mean_token_accuracy": 0.09484207406640052,
|
|
"num_tokens": 1030023.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 7.164184045791626,
|
|
"epoch": 0.4663009404388715,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000297,
|
|
"loss": 6.8295,
|
|
"mean_token_accuracy": 0.0953464850783348,
|
|
"num_tokens": 1038824.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 7.216883039474487,
|
|
"epoch": 0.4702194357366771,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 7.0882,
|
|
"mean_token_accuracy": 0.09343515783548355,
|
|
"num_tokens": 1048207.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 7.304975414276123,
|
|
"epoch": 0.47413793103448276,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000302,
|
|
"loss": 7.0559,
|
|
"mean_token_accuracy": 0.08962106555700303,
|
|
"num_tokens": 1057354.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 7.198044776916504,
|
|
"epoch": 0.4780564263322884,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 6.9893,
|
|
"mean_token_accuracy": 0.09330410435795784,
|
|
"num_tokens": 1066825.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 7.137618494033814,
|
|
"epoch": 0.48197492163009403,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000307,
|
|
"loss": 6.9155,
|
|
"mean_token_accuracy": 0.08996602892875671,
|
|
"num_tokens": 1075752.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 7.007559776306152,
|
|
"epoch": 0.48589341692789967,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.8323,
|
|
"mean_token_accuracy": 0.09399376884102821,
|
|
"num_tokens": 1084826.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 7.223559427261352,
|
|
"epoch": 0.4898119122257053,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.9666,
|
|
"mean_token_accuracy": 0.09359999522566795,
|
|
"num_tokens": 1093425.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 7.108446407318115,
|
|
"epoch": 0.493730407523511,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 6.9828,
|
|
"mean_token_accuracy": 0.089838757365942,
|
|
"num_tokens": 1102619.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 7.145089435577392,
|
|
"epoch": 0.49764890282131663,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.8601,
|
|
"mean_token_accuracy": 0.09595804288983345,
|
|
"num_tokens": 1111337.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 7.005008172988892,
|
|
"epoch": 0.5015673981191222,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.7035,
|
|
"mean_token_accuracy": 0.0990886114537716,
|
|
"num_tokens": 1119381.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 6.96934700012207,
|
|
"epoch": 0.5054858934169278,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000322,
|
|
"loss": 6.7597,
|
|
"mean_token_accuracy": 0.09665322229266167,
|
|
"num_tokens": 1127825.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 7.11965799331665,
|
|
"epoch": 0.5094043887147336,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 6.8203,
|
|
"mean_token_accuracy": 0.09057366773486138,
|
|
"num_tokens": 1136750.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 7.017684459686279,
|
|
"epoch": 0.5133228840125392,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 6.8499,
|
|
"mean_token_accuracy": 0.09456580057740212,
|
|
"num_tokens": 1145739.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 7.034306764602661,
|
|
"epoch": 0.5172413793103449,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.8973,
|
|
"mean_token_accuracy": 0.09757498279213905,
|
|
"num_tokens": 1154415.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 7.052440977096557,
|
|
"epoch": 0.5211598746081505,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.7751,
|
|
"mean_token_accuracy": 0.09611302688717842,
|
|
"num_tokens": 1162480.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 6.977913856506348,
|
|
"epoch": 0.5250783699059561,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.8559,
|
|
"mean_token_accuracy": 0.09223495721817017,
|
|
"num_tokens": 1170774.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 7.030881881713867,
|
|
"epoch": 0.5289968652037618,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.7924,
|
|
"mean_token_accuracy": 0.09683787003159523,
|
|
"num_tokens": 1179629.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 7.053485012054443,
|
|
"epoch": 0.5329153605015674,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.938,
|
|
"mean_token_accuracy": 0.09229604452848435,
|
|
"num_tokens": 1189111.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 7.108199834823608,
|
|
"epoch": 0.536833855799373,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.9495,
|
|
"mean_token_accuracy": 0.09128881692886352,
|
|
"num_tokens": 1198827.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 7.092297840118408,
|
|
"epoch": 0.5407523510971787,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 6.8482,
|
|
"mean_token_accuracy": 0.09970205947756768,
|
|
"num_tokens": 1207089.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 7.057426071166992,
|
|
"epoch": 0.5446708463949843,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.8435,
|
|
"mean_token_accuracy": 0.08852889537811279,
|
|
"num_tokens": 1216509.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 6.900876426696778,
|
|
"epoch": 0.54858934169279,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.6756,
|
|
"mean_token_accuracy": 0.10320580378174782,
|
|
"num_tokens": 1225684.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 7.005167055130005,
|
|
"epoch": 0.5525078369905956,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.7057,
|
|
"mean_token_accuracy": 0.10480915755033493,
|
|
"num_tokens": 1233675.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 6.892497873306274,
|
|
"epoch": 0.5564263322884012,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.7091,
|
|
"mean_token_accuracy": 0.10067695155739784,
|
|
"num_tokens": 1242147.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 6.934285736083984,
|
|
"epoch": 0.5603448275862069,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.8249,
|
|
"mean_token_accuracy": 0.09867035746574401,
|
|
"num_tokens": 1251127.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 6.978139781951905,
|
|
"epoch": 0.5642633228840125,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.7775,
|
|
"mean_token_accuracy": 0.09740801975131035,
|
|
"num_tokens": 1260589.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 7.05702314376831,
|
|
"epoch": 0.5681818181818182,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.8522,
|
|
"mean_token_accuracy": 0.09754758477210998,
|
|
"num_tokens": 1269281.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 6.951777076721191,
|
|
"epoch": 0.5721003134796239,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.7117,
|
|
"mean_token_accuracy": 0.09818840324878693,
|
|
"num_tokens": 1278033.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 7.001934242248535,
|
|
"epoch": 0.5760188087774295,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.7808,
|
|
"mean_token_accuracy": 0.10354246944189072,
|
|
"num_tokens": 1285847.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 6.932463216781616,
|
|
"epoch": 0.5799373040752351,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.7726,
|
|
"mean_token_accuracy": 0.09513568431138993,
|
|
"num_tokens": 1294770.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 7.036710739135742,
|
|
"epoch": 0.5838557993730408,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.9512,
|
|
"mean_token_accuracy": 0.0940048098564148,
|
|
"num_tokens": 1304282.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 6.997774505615235,
|
|
"epoch": 0.5877742946708464,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.7923,
|
|
"mean_token_accuracy": 0.09595935121178627,
|
|
"num_tokens": 1313218.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 6.966546869277954,
|
|
"epoch": 0.591692789968652,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.754,
|
|
"mean_token_accuracy": 0.0985157236456871,
|
|
"num_tokens": 1322093.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 6.83478045463562,
|
|
"epoch": 0.5956112852664577,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 6.7857,
|
|
"mean_token_accuracy": 0.09971616193652152,
|
|
"num_tokens": 1330718.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 6.954968690872192,
|
|
"epoch": 0.5995297805642633,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.6457,
|
|
"mean_token_accuracy": 0.10431547313928605,
|
|
"num_tokens": 1339672.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 6.861670970916748,
|
|
"epoch": 0.603448275862069,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.7001,
|
|
"mean_token_accuracy": 0.10293109342455864,
|
|
"num_tokens": 1348587.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 6.8885609149932865,
|
|
"epoch": 0.6073667711598746,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.6826,
|
|
"mean_token_accuracy": 0.10689334198832512,
|
|
"num_tokens": 1357597.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 6.883357572555542,
|
|
"epoch": 0.6112852664576802,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.7744,
|
|
"mean_token_accuracy": 0.101459039747715,
|
|
"num_tokens": 1366246.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 6.89829683303833,
|
|
"epoch": 0.6152037617554859,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.6695,
|
|
"mean_token_accuracy": 0.10532263070344924,
|
|
"num_tokens": 1374664.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 6.8325498580932615,
|
|
"epoch": 0.6191222570532915,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.6889,
|
|
"mean_token_accuracy": 0.10108358785510063,
|
|
"num_tokens": 1382765.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 6.7815876483917235,
|
|
"epoch": 0.6230407523510971,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.6442,
|
|
"mean_token_accuracy": 0.10871021300554276,
|
|
"num_tokens": 1391708.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 6.786308908462525,
|
|
"epoch": 0.6269592476489029,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.814,
|
|
"mean_token_accuracy": 0.09444232732057571,
|
|
"num_tokens": 1401373.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 6.926651859283448,
|
|
"epoch": 0.6308777429467085,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.6124,
|
|
"mean_token_accuracy": 0.10876154825091362,
|
|
"num_tokens": 1409702.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 6.87204418182373,
|
|
"epoch": 0.6347962382445141,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.8084,
|
|
"mean_token_accuracy": 0.10008606985211373,
|
|
"num_tokens": 1419596.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 6.807423734664917,
|
|
"epoch": 0.6387147335423198,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.7461,
|
|
"mean_token_accuracy": 0.10076582729816437,
|
|
"num_tokens": 1428437.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 6.9204614639282225,
|
|
"epoch": 0.6426332288401254,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.6547,
|
|
"mean_token_accuracy": 0.10060450211167335,
|
|
"num_tokens": 1436764.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 6.812792015075684,
|
|
"epoch": 0.646551724137931,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.7007,
|
|
"mean_token_accuracy": 0.10875345095992088,
|
|
"num_tokens": 1445755.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 6.902109909057617,
|
|
"epoch": 0.6504702194357367,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.7627,
|
|
"mean_token_accuracy": 0.10212339907884598,
|
|
"num_tokens": 1454550.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 6.757165718078613,
|
|
"epoch": 0.6543887147335423,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.6529,
|
|
"mean_token_accuracy": 0.1076908752322197,
|
|
"num_tokens": 1463039.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 6.920673799514771,
|
|
"epoch": 0.658307210031348,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.7125,
|
|
"mean_token_accuracy": 0.10059169679880142,
|
|
"num_tokens": 1472029.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 6.742834091186523,
|
|
"epoch": 0.6622257053291536,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.723,
|
|
"mean_token_accuracy": 0.10695556625723839,
|
|
"num_tokens": 1481588.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 6.7755883693695065,
|
|
"epoch": 0.6661442006269592,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.6131,
|
|
"mean_token_accuracy": 0.1035246841609478,
|
|
"num_tokens": 1489915.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 6.87349271774292,
|
|
"epoch": 0.6700626959247649,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.7211,
|
|
"mean_token_accuracy": 0.10070185288786888,
|
|
"num_tokens": 1497920.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.742019605636597,
|
|
"epoch": 0.6739811912225705,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.5571,
|
|
"mean_token_accuracy": 0.10514650270342826,
|
|
"num_tokens": 1506965.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 6.755401372909546,
|
|
"epoch": 0.6778996865203761,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.6626,
|
|
"mean_token_accuracy": 0.10886923670768738,
|
|
"num_tokens": 1516028.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 6.824488735198974,
|
|
"epoch": 0.6818181818181818,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.6802,
|
|
"mean_token_accuracy": 0.10832962691783905,
|
|
"num_tokens": 1524579.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 6.729337167739868,
|
|
"epoch": 0.6857366771159875,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.605,
|
|
"mean_token_accuracy": 0.1062053769826889,
|
|
"num_tokens": 1534285.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 6.7189655780792235,
|
|
"epoch": 0.6896551724137931,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.5898,
|
|
"mean_token_accuracy": 0.10706395953893662,
|
|
"num_tokens": 1543701.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 6.891758966445923,
|
|
"epoch": 0.6935736677115988,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.8576,
|
|
"mean_token_accuracy": 0.09462928622961045,
|
|
"num_tokens": 1552715.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 6.6106942653656,
|
|
"epoch": 0.6974921630094044,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.4728,
|
|
"mean_token_accuracy": 0.1057778999209404,
|
|
"num_tokens": 1561523.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 6.685323572158813,
|
|
"epoch": 0.70141065830721,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.6995,
|
|
"mean_token_accuracy": 0.10680059865117073,
|
|
"num_tokens": 1570057.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 6.810699081420898,
|
|
"epoch": 0.7053291536050157,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.5487,
|
|
"mean_token_accuracy": 0.10731169655919075,
|
|
"num_tokens": 1578795.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 6.618034505844117,
|
|
"epoch": 0.7092476489028213,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.627,
|
|
"mean_token_accuracy": 0.10445040464401245,
|
|
"num_tokens": 1588108.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 6.834760522842407,
|
|
"epoch": 0.713166144200627,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.5897,
|
|
"mean_token_accuracy": 0.10842868015170097,
|
|
"num_tokens": 1596545.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 6.617375135421753,
|
|
"epoch": 0.7170846394984326,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.5774,
|
|
"mean_token_accuracy": 0.11101481318473816,
|
|
"num_tokens": 1605098.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 6.7286604881286625,
|
|
"epoch": 0.7210031347962382,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.5928,
|
|
"mean_token_accuracy": 0.11244359910488129,
|
|
"num_tokens": 1613760.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 6.685993957519531,
|
|
"epoch": 0.7249216300940439,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.6311,
|
|
"mean_token_accuracy": 0.10528192594647408,
|
|
"num_tokens": 1622345.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 6.825484371185302,
|
|
"epoch": 0.7288401253918495,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.6198,
|
|
"mean_token_accuracy": 0.10753775164484977,
|
|
"num_tokens": 1630706.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 6.736838388442993,
|
|
"epoch": 0.7327586206896551,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.6851,
|
|
"mean_token_accuracy": 0.10634701699018478,
|
|
"num_tokens": 1639880.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 6.709425592422486,
|
|
"epoch": 0.7366771159874608,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.5189,
|
|
"mean_token_accuracy": 0.10119672417640686,
|
|
"num_tokens": 1648325.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 6.547716999053955,
|
|
"epoch": 0.7405956112852664,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.4928,
|
|
"mean_token_accuracy": 0.11216101795434952,
|
|
"num_tokens": 1656951.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 6.6757955074310305,
|
|
"epoch": 0.7445141065830722,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.6264,
|
|
"mean_token_accuracy": 0.1065959431231022,
|
|
"num_tokens": 1665832.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.653597450256347,
|
|
"epoch": 0.7484326018808778,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.5458,
|
|
"mean_token_accuracy": 0.10681739151477813,
|
|
"num_tokens": 1675116.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 6.55257477760315,
|
|
"epoch": 0.7523510971786834,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.5317,
|
|
"mean_token_accuracy": 0.11514699757099152,
|
|
"num_tokens": 1684195.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 6.689774370193481,
|
|
"epoch": 0.7562695924764891,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.4895,
|
|
"mean_token_accuracy": 0.11039396822452545,
|
|
"num_tokens": 1693025.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.481554937362671,
|
|
"epoch": 0.7601880877742947,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.4836,
|
|
"mean_token_accuracy": 0.10764000788331032,
|
|
"num_tokens": 1701210.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 6.6965916633605955,
|
|
"epoch": 0.7641065830721003,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.5847,
|
|
"mean_token_accuracy": 0.10125251486897469,
|
|
"num_tokens": 1711026.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 6.65680046081543,
|
|
"epoch": 0.768025078369906,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.5874,
|
|
"mean_token_accuracy": 0.10824618190526962,
|
|
"num_tokens": 1719441.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 6.599896430969238,
|
|
"epoch": 0.7719435736677116,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.5849,
|
|
"mean_token_accuracy": 0.10633337944746017,
|
|
"num_tokens": 1728300.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 6.620725393295288,
|
|
"epoch": 0.7758620689655172,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.4769,
|
|
"mean_token_accuracy": 0.11418513432145119,
|
|
"num_tokens": 1736364.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.466502714157104,
|
|
"epoch": 0.7797805642633229,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.3224,
|
|
"mean_token_accuracy": 0.1263233445584774,
|
|
"num_tokens": 1745043.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.654029464721679,
|
|
"epoch": 0.7836990595611285,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.4667,
|
|
"mean_token_accuracy": 0.11487501338124276,
|
|
"num_tokens": 1754008.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.7836990595611285,
|
|
"eval_entropy": 6.3669652088668,
|
|
"eval_loss": 6.630230903625488,
|
|
"eval_mean_token_accuracy": 0.11402813254227472,
|
|
"eval_num_tokens": 1754008.0,
|
|
"eval_runtime": 2.8378,
|
|
"eval_samples_per_second": 1452.509,
|
|
"eval_steps_per_second": 181.828,
|
|
"step": 1000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 12750,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2371902916608000.0,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|