Model: fpadovani/hin-deva-100mb-after-ppt-shuff-dyck-100mb-ckpt500_seed3407 Source: Original Platform
12057 lines
329 KiB
JSON
12057 lines
329 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.5763688760806917,
|
|
"eval_steps": 3000,
|
|
"global_step": 6000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 4.81198468208313,
|
|
"epoch": 0.0004803073967339097,
|
|
"grad_norm": 15.3125,
|
|
"learning_rate": 2e-06,
|
|
"loss": 14.3995,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 10855.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 4.828950214385986,
|
|
"epoch": 0.0009606147934678194,
|
|
"grad_norm": 16.0,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 14.4568,
|
|
"mean_token_accuracy": 6.361323175951838e-05,
|
|
"num_tokens": 24110.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 4.885565328598022,
|
|
"epoch": 0.001440922190201729,
|
|
"grad_norm": 18.375,
|
|
"learning_rate": 7e-06,
|
|
"loss": 14.1468,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 35984.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 5.113980484008789,
|
|
"epoch": 0.0019212295869356388,
|
|
"grad_norm": 25.5,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 13.5274,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 48152.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 7.0846137523651125,
|
|
"epoch": 0.0024015369836695487,
|
|
"grad_norm": 18.875,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 11.983,
|
|
"mean_token_accuracy": 5.9031875571236016e-05,
|
|
"num_tokens": 59810.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 10.311653995513916,
|
|
"epoch": 0.002881844380403458,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 10.8966,
|
|
"mean_token_accuracy": 0.0035814862465485932,
|
|
"num_tokens": 70852.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.698549842834472,
|
|
"epoch": 0.0033621517771373678,
|
|
"grad_norm": 3.453125,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 10.681,
|
|
"mean_token_accuracy": 0.012990868836641311,
|
|
"num_tokens": 83378.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.70135440826416,
|
|
"epoch": 0.0038424591738712775,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 10.3702,
|
|
"mean_token_accuracy": 0.015855902433395387,
|
|
"num_tokens": 95505.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.669420051574708,
|
|
"epoch": 0.004322766570605188,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 10.0399,
|
|
"mean_token_accuracy": 0.019150405284017326,
|
|
"num_tokens": 106812.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.626140022277832,
|
|
"epoch": 0.004803073967339097,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.8531,
|
|
"mean_token_accuracy": 0.030371082201600074,
|
|
"num_tokens": 118572.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.630718421936034,
|
|
"epoch": 0.005283381364073006,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.7085,
|
|
"mean_token_accuracy": 0.02918087989091873,
|
|
"num_tokens": 130051.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.632691478729248,
|
|
"epoch": 0.005763688760806916,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.6316,
|
|
"mean_token_accuracy": 0.033551334962248804,
|
|
"num_tokens": 141920.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.621756076812744,
|
|
"epoch": 0.006243996157540826,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.4968,
|
|
"mean_token_accuracy": 0.03377603869885206,
|
|
"num_tokens": 152706.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.59926996231079,
|
|
"epoch": 0.0067243035542747355,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.4671,
|
|
"mean_token_accuracy": 0.030284658074378967,
|
|
"num_tokens": 165253.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.586241340637207,
|
|
"epoch": 0.007204610951008645,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.3528,
|
|
"mean_token_accuracy": 0.03066213186830282,
|
|
"num_tokens": 176708.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.572576808929444,
|
|
"epoch": 0.007684918347742555,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.3119,
|
|
"mean_token_accuracy": 0.02979854876175523,
|
|
"num_tokens": 188240.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.554954528808594,
|
|
"epoch": 0.008165225744476465,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 9.1145,
|
|
"mean_token_accuracy": 0.03125303704291582,
|
|
"num_tokens": 198355.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.53057928085327,
|
|
"epoch": 0.008645533141210375,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 9.0646,
|
|
"mean_token_accuracy": 0.02982727512717247,
|
|
"num_tokens": 209497.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.494773197174073,
|
|
"epoch": 0.009125840537944284,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 8.9936,
|
|
"mean_token_accuracy": 0.02780488096177578,
|
|
"num_tokens": 220859.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.448780918121338,
|
|
"epoch": 0.009606147934678195,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.9232,
|
|
"mean_token_accuracy": 0.030998879671096803,
|
|
"num_tokens": 231550.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 10.376792049407959,
|
|
"epoch": 0.010086455331412104,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.7452,
|
|
"mean_token_accuracy": 0.030790003202855586,
|
|
"num_tokens": 244210.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 10.282748031616212,
|
|
"epoch": 0.010566762728146013,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.6175,
|
|
"mean_token_accuracy": 0.040817446634173395,
|
|
"num_tokens": 255745.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 10.166150856018067,
|
|
"epoch": 0.011047070124879923,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.5074,
|
|
"mean_token_accuracy": 0.0365377115085721,
|
|
"num_tokens": 266180.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 10.028709888458252,
|
|
"epoch": 0.011527377521613832,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.3681,
|
|
"mean_token_accuracy": 0.03765994198620319,
|
|
"num_tokens": 277736.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 9.827960968017578,
|
|
"epoch": 0.012007684918347743,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.2429,
|
|
"mean_token_accuracy": 0.035723325610160825,
|
|
"num_tokens": 289069.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 9.59237585067749,
|
|
"epoch": 0.012487992315081652,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 8.0891,
|
|
"mean_token_accuracy": 0.04738196656107903,
|
|
"num_tokens": 300240.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 9.368733978271484,
|
|
"epoch": 0.012968299711815562,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.0332,
|
|
"mean_token_accuracy": 0.04018798861652613,
|
|
"num_tokens": 311698.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 9.110132884979247,
|
|
"epoch": 0.013448607108549471,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 7.9056,
|
|
"mean_token_accuracy": 0.0432288508862257,
|
|
"num_tokens": 322844.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 8.820003223419189,
|
|
"epoch": 0.013928914505283382,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 7.8235,
|
|
"mean_token_accuracy": 0.045638217404484746,
|
|
"num_tokens": 335092.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 8.585826587677001,
|
|
"epoch": 0.01440922190201729,
|
|
"grad_norm": 0.8359375,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 7.7332,
|
|
"mean_token_accuracy": 0.04667803719639778,
|
|
"num_tokens": 347033.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 8.385289859771728,
|
|
"epoch": 0.014889529298751201,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 7.6524,
|
|
"mean_token_accuracy": 0.05755673125386238,
|
|
"num_tokens": 358696.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 8.231111812591553,
|
|
"epoch": 0.01536983669548511,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.6369,
|
|
"mean_token_accuracy": 0.05747554413974285,
|
|
"num_tokens": 369390.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 8.13049030303955,
|
|
"epoch": 0.01585014409221902,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.573,
|
|
"mean_token_accuracy": 0.058345531672239305,
|
|
"num_tokens": 380540.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.037137985229492,
|
|
"epoch": 0.01633045148895293,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.5672,
|
|
"mean_token_accuracy": 0.05862935781478882,
|
|
"num_tokens": 391243.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 7.971378183364868,
|
|
"epoch": 0.01681075888568684,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.5403,
|
|
"mean_token_accuracy": 0.06493047513067722,
|
|
"num_tokens": 403336.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 7.996695470809937,
|
|
"epoch": 0.01729106628242075,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.4714,
|
|
"mean_token_accuracy": 0.06883232817053794,
|
|
"num_tokens": 413886.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 7.944087362289428,
|
|
"epoch": 0.01777137367915466,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.5072,
|
|
"mean_token_accuracy": 0.07003857865929604,
|
|
"num_tokens": 425277.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 7.903090763092041,
|
|
"epoch": 0.01825168107588857,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.5901,
|
|
"mean_token_accuracy": 0.07094852812588215,
|
|
"num_tokens": 436868.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 7.9524956226348875,
|
|
"epoch": 0.018731988472622477,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.3956,
|
|
"mean_token_accuracy": 0.0713607795536518,
|
|
"num_tokens": 448349.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 7.893163013458252,
|
|
"epoch": 0.01921229586935639,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.398,
|
|
"mean_token_accuracy": 0.07450502514839172,
|
|
"num_tokens": 459447.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 7.827638578414917,
|
|
"epoch": 0.0196926032660903,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.3545,
|
|
"mean_token_accuracy": 0.07836289256811142,
|
|
"num_tokens": 470734.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 7.920483875274658,
|
|
"epoch": 0.020172910662824207,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.3929,
|
|
"mean_token_accuracy": 0.07436848841607571,
|
|
"num_tokens": 482015.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 7.829608154296875,
|
|
"epoch": 0.020653218059558116,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.3388,
|
|
"mean_token_accuracy": 0.0812894694507122,
|
|
"num_tokens": 493339.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 7.832039451599121,
|
|
"epoch": 0.021133525456292025,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.2806,
|
|
"mean_token_accuracy": 0.08215347118675709,
|
|
"num_tokens": 504924.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 7.841120386123658,
|
|
"epoch": 0.021613832853025938,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.2586,
|
|
"mean_token_accuracy": 0.07783420942723751,
|
|
"num_tokens": 516603.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 7.667848110198975,
|
|
"epoch": 0.022094140249759846,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.1767,
|
|
"mean_token_accuracy": 0.0903685748577118,
|
|
"num_tokens": 528347.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 7.665532779693604,
|
|
"epoch": 0.022574447646493755,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.2657,
|
|
"mean_token_accuracy": 0.08881851136684418,
|
|
"num_tokens": 539328.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.787159252166748,
|
|
"epoch": 0.023054755043227664,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.2264,
|
|
"mean_token_accuracy": 0.09179538786411286,
|
|
"num_tokens": 549297.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.68054313659668,
|
|
"epoch": 0.023535062439961577,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.1925,
|
|
"mean_token_accuracy": 0.0870781309902668,
|
|
"num_tokens": 560306.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.722461795806884,
|
|
"epoch": 0.024015369836695485,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.2601,
|
|
"mean_token_accuracy": 0.08716249391436577,
|
|
"num_tokens": 571972.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.669500827789307,
|
|
"epoch": 0.024495677233429394,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.1479,
|
|
"mean_token_accuracy": 0.09271593019366264,
|
|
"num_tokens": 582962.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.6647216796875,
|
|
"epoch": 0.024975984630163303,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.1214,
|
|
"mean_token_accuracy": 0.09072922170162201,
|
|
"num_tokens": 597193.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.66283483505249,
|
|
"epoch": 0.025456292026897216,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.1819,
|
|
"mean_token_accuracy": 0.09304547160863877,
|
|
"num_tokens": 608982.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.661752843856812,
|
|
"epoch": 0.025936599423631124,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.2188,
|
|
"mean_token_accuracy": 0.08966975659132004,
|
|
"num_tokens": 619953.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.643835210800171,
|
|
"epoch": 0.026416906820365033,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.1751,
|
|
"mean_token_accuracy": 0.09371341913938522,
|
|
"num_tokens": 631039.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.632717418670654,
|
|
"epoch": 0.026897214217098942,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.1656,
|
|
"mean_token_accuracy": 0.09481634944677353,
|
|
"num_tokens": 642656.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.468483591079712,
|
|
"epoch": 0.027377521613832854,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.0285,
|
|
"mean_token_accuracy": 0.10727941244840622,
|
|
"num_tokens": 653748.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.516920471191407,
|
|
"epoch": 0.027857829010566763,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.0029,
|
|
"mean_token_accuracy": 0.09661566317081452,
|
|
"num_tokens": 665618.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.486124277114868,
|
|
"epoch": 0.028338136407300672,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.0287,
|
|
"mean_token_accuracy": 0.09913064762949944,
|
|
"num_tokens": 677329.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.49315767288208,
|
|
"epoch": 0.02881844380403458,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 6.9864,
|
|
"mean_token_accuracy": 0.1033214770257473,
|
|
"num_tokens": 688278.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.431641435623169,
|
|
"epoch": 0.029298751200768493,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.046,
|
|
"mean_token_accuracy": 0.10180941373109817,
|
|
"num_tokens": 700739.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.378959465026855,
|
|
"epoch": 0.029779058597502402,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 6.9858,
|
|
"mean_token_accuracy": 0.104751455783844,
|
|
"num_tokens": 712527.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.4179362773895265,
|
|
"epoch": 0.03025936599423631,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.0113,
|
|
"mean_token_accuracy": 0.09946026802062988,
|
|
"num_tokens": 724514.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.464642429351807,
|
|
"epoch": 0.03073967339097022,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 6.958,
|
|
"mean_token_accuracy": 0.10636739879846573,
|
|
"num_tokens": 735679.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.379268789291382,
|
|
"epoch": 0.03121998078770413,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000162,
|
|
"loss": 6.9502,
|
|
"mean_token_accuracy": 0.10707954466342925,
|
|
"num_tokens": 747896.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.4328147888183596,
|
|
"epoch": 0.03170028818443804,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.0008,
|
|
"mean_token_accuracy": 0.10451544597744941,
|
|
"num_tokens": 759081.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.373377466201783,
|
|
"epoch": 0.03218059558117195,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 6.9349,
|
|
"mean_token_accuracy": 0.10051383301615716,
|
|
"num_tokens": 770459.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.3182484149932865,
|
|
"epoch": 0.03266090297790586,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 6.9097,
|
|
"mean_token_accuracy": 0.10436427593231201,
|
|
"num_tokens": 783960.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.2723020076751705,
|
|
"epoch": 0.03314121037463977,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 6.9998,
|
|
"mean_token_accuracy": 0.1017355315387249,
|
|
"num_tokens": 795425.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.288401937484741,
|
|
"epoch": 0.03362151777137368,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 6.9466,
|
|
"mean_token_accuracy": 0.1032905712723732,
|
|
"num_tokens": 807536.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.429675006866455,
|
|
"epoch": 0.034101825168107586,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000177,
|
|
"loss": 6.9955,
|
|
"mean_token_accuracy": 0.09869879111647606,
|
|
"num_tokens": 818801.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.303883075714111,
|
|
"epoch": 0.0345821325648415,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 6.8664,
|
|
"mean_token_accuracy": 0.1042160525918007,
|
|
"num_tokens": 831497.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.275684547424317,
|
|
"epoch": 0.03506243996157541,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000182,
|
|
"loss": 6.8349,
|
|
"mean_token_accuracy": 0.10631057769060134,
|
|
"num_tokens": 842491.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.303065443038941,
|
|
"epoch": 0.03554274735830932,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 6.9059,
|
|
"mean_token_accuracy": 0.09917943850159645,
|
|
"num_tokens": 854560.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.275861215591431,
|
|
"epoch": 0.03602305475504323,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000187,
|
|
"loss": 6.8151,
|
|
"mean_token_accuracy": 0.11120132729411125,
|
|
"num_tokens": 866688.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.233143997192383,
|
|
"epoch": 0.03650336215177714,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 6.9205,
|
|
"mean_token_accuracy": 0.09971508085727691,
|
|
"num_tokens": 879484.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.290747499465942,
|
|
"epoch": 0.036983669548511046,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000192,
|
|
"loss": 6.9039,
|
|
"mean_token_accuracy": 0.10731675177812576,
|
|
"num_tokens": 890807.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.2609399318695065,
|
|
"epoch": 0.037463976945244955,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 6.854,
|
|
"mean_token_accuracy": 0.10835549905896187,
|
|
"num_tokens": 901759.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.174216985702515,
|
|
"epoch": 0.037944284341978864,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 6.7707,
|
|
"mean_token_accuracy": 0.1162538155913353,
|
|
"num_tokens": 912212.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.264402294158936,
|
|
"epoch": 0.03842459173871278,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 6.8764,
|
|
"mean_token_accuracy": 0.10775518119335174,
|
|
"num_tokens": 923947.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.194364166259765,
|
|
"epoch": 0.03890489913544669,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.000202,
|
|
"loss": 6.8149,
|
|
"mean_token_accuracy": 0.1155998706817627,
|
|
"num_tokens": 935732.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.094007158279419,
|
|
"epoch": 0.0393852065321806,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 6.7534,
|
|
"mean_token_accuracy": 0.11219719424843788,
|
|
"num_tokens": 948261.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.198687505722046,
|
|
"epoch": 0.039865513928914506,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000207,
|
|
"loss": 6.8682,
|
|
"mean_token_accuracy": 0.11036199703812599,
|
|
"num_tokens": 959574.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.14764518737793,
|
|
"epoch": 0.040345821325648415,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 6.9302,
|
|
"mean_token_accuracy": 0.10567210242152214,
|
|
"num_tokens": 970329.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.284962558746338,
|
|
"epoch": 0.040826128722382324,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000212,
|
|
"loss": 6.7852,
|
|
"mean_token_accuracy": 0.11808342635631561,
|
|
"num_tokens": 982037.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 6.99963059425354,
|
|
"epoch": 0.04130643611911623,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 6.7507,
|
|
"mean_token_accuracy": 0.1121592566370964,
|
|
"num_tokens": 994612.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.1772722721099855,
|
|
"epoch": 0.04178674351585014,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 6.8563,
|
|
"mean_token_accuracy": 0.11890432462096215,
|
|
"num_tokens": 1005960.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.119032526016236,
|
|
"epoch": 0.04226705091258405,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 6.726,
|
|
"mean_token_accuracy": 0.11254842653870582,
|
|
"num_tokens": 1017618.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.120699787139893,
|
|
"epoch": 0.042747358309317966,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000222,
|
|
"loss": 6.7617,
|
|
"mean_token_accuracy": 0.11123086810112,
|
|
"num_tokens": 1029307.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.10453462600708,
|
|
"epoch": 0.043227665706051875,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 6.7794,
|
|
"mean_token_accuracy": 0.11213452070951462,
|
|
"num_tokens": 1042027.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.109935092926025,
|
|
"epoch": 0.043707973102785784,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 6.7726,
|
|
"mean_token_accuracy": 0.11005142331123352,
|
|
"num_tokens": 1053125.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.093224906921387,
|
|
"epoch": 0.04418828049951969,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 6.7646,
|
|
"mean_token_accuracy": 0.11863623559474945,
|
|
"num_tokens": 1064908.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.0393500328063965,
|
|
"epoch": 0.0446685878962536,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 6.6415,
|
|
"mean_token_accuracy": 0.12022090703248978,
|
|
"num_tokens": 1076328.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.159615230560303,
|
|
"epoch": 0.04514889529298751,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 6.8668,
|
|
"mean_token_accuracy": 0.10638144612312317,
|
|
"num_tokens": 1088469.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 6.9358738422393795,
|
|
"epoch": 0.04562920268972142,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000237,
|
|
"loss": 6.6608,
|
|
"mean_token_accuracy": 0.11796007007360458,
|
|
"num_tokens": 1099408.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 6.921041584014892,
|
|
"epoch": 0.04610951008645533,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 6.596,
|
|
"mean_token_accuracy": 0.12084084451198578,
|
|
"num_tokens": 1111101.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 6.980242967605591,
|
|
"epoch": 0.046589817483189244,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000242,
|
|
"loss": 6.6189,
|
|
"mean_token_accuracy": 0.11961494460701942,
|
|
"num_tokens": 1122877.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 6.998215103149414,
|
|
"epoch": 0.04707012487992315,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 6.7183,
|
|
"mean_token_accuracy": 0.1069619596004486,
|
|
"num_tokens": 1133956.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 6.955817556381225,
|
|
"epoch": 0.04755043227665706,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.6106,
|
|
"mean_token_accuracy": 0.12115221694111825,
|
|
"num_tokens": 1146101.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 6.991823005676269,
|
|
"epoch": 0.04803073967339097,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 6.704,
|
|
"mean_token_accuracy": 0.1240153320133686,
|
|
"num_tokens": 1157432.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 6.995119285583496,
|
|
"epoch": 0.04851104707012488,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000252,
|
|
"loss": 6.6931,
|
|
"mean_token_accuracy": 0.12121785953640937,
|
|
"num_tokens": 1167601.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 6.925166416168213,
|
|
"epoch": 0.04899135446685879,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 6.5948,
|
|
"mean_token_accuracy": 0.11933866590261459,
|
|
"num_tokens": 1178818.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.102405261993408,
|
|
"epoch": 0.0494716618635927,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000257,
|
|
"loss": 6.8296,
|
|
"mean_token_accuracy": 0.11879347264766693,
|
|
"num_tokens": 1189977.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 6.896050024032593,
|
|
"epoch": 0.049951969260326606,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 6.6543,
|
|
"mean_token_accuracy": 0.12233106046915054,
|
|
"num_tokens": 1201039.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.007365083694458,
|
|
"epoch": 0.05043227665706052,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000262,
|
|
"loss": 6.6791,
|
|
"mean_token_accuracy": 0.12215208187699318,
|
|
"num_tokens": 1212573.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.002063369750976,
|
|
"epoch": 0.05091258405379443,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 6.6208,
|
|
"mean_token_accuracy": 0.1271028608083725,
|
|
"num_tokens": 1223382.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 6.9438478469848635,
|
|
"epoch": 0.05139289145052834,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 6.6969,
|
|
"mean_token_accuracy": 0.12958464100956918,
|
|
"num_tokens": 1236501.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 6.931712675094604,
|
|
"epoch": 0.05187319884726225,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 6.687,
|
|
"mean_token_accuracy": 0.12256318107247352,
|
|
"num_tokens": 1246798.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 6.9002622127532955,
|
|
"epoch": 0.05235350624399616,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 6.6164,
|
|
"mean_token_accuracy": 0.12228193208575248,
|
|
"num_tokens": 1258182.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 6.873838090896607,
|
|
"epoch": 0.052833813640730067,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 6.5781,
|
|
"mean_token_accuracy": 0.11714496314525605,
|
|
"num_tokens": 1270273.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 6.869143629074097,
|
|
"epoch": 0.053314121037463975,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000277,
|
|
"loss": 6.6336,
|
|
"mean_token_accuracy": 0.11991709843277931,
|
|
"num_tokens": 1281136.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 6.914445209503174,
|
|
"epoch": 0.053794428434197884,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 6.6257,
|
|
"mean_token_accuracy": 0.12010404467582703,
|
|
"num_tokens": 1294488.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 6.732436418533325,
|
|
"epoch": 0.05427473583093179,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 6.5262,
|
|
"mean_token_accuracy": 0.12693093419075013,
|
|
"num_tokens": 1304113.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 6.927071809768677,
|
|
"epoch": 0.05475504322766571,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 6.5843,
|
|
"mean_token_accuracy": 0.12877818644046785,
|
|
"num_tokens": 1315417.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 6.783261919021607,
|
|
"epoch": 0.05523535062439962,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000287,
|
|
"loss": 6.5521,
|
|
"mean_token_accuracy": 0.1234595388174057,
|
|
"num_tokens": 1328084.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 6.8645414352417,
|
|
"epoch": 0.05571565802113353,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 6.6982,
|
|
"mean_token_accuracy": 0.1229254849255085,
|
|
"num_tokens": 1338696.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 6.887264966964722,
|
|
"epoch": 0.056195965417867436,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000292,
|
|
"loss": 6.6333,
|
|
"mean_token_accuracy": 0.12206205278635025,
|
|
"num_tokens": 1350240.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 6.901881551742553,
|
|
"epoch": 0.056676272814601344,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 6.5792,
|
|
"mean_token_accuracy": 0.12374859303236008,
|
|
"num_tokens": 1361720.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 6.646714115142823,
|
|
"epoch": 0.05715658021133525,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000297,
|
|
"loss": 6.5831,
|
|
"mean_token_accuracy": 0.12852583453059196,
|
|
"num_tokens": 1373286.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 6.89121675491333,
|
|
"epoch": 0.05763688760806916,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 6.5332,
|
|
"mean_token_accuracy": 0.12378557696938515,
|
|
"num_tokens": 1384274.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 6.707057476043701,
|
|
"epoch": 0.05811719500480307,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000302,
|
|
"loss": 6.5674,
|
|
"mean_token_accuracy": 0.1248041570186615,
|
|
"num_tokens": 1395355.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 6.787681436538696,
|
|
"epoch": 0.05859750240153699,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 6.5071,
|
|
"mean_token_accuracy": 0.1337241604924202,
|
|
"num_tokens": 1406664.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 6.907395648956299,
|
|
"epoch": 0.059077809798270896,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000307,
|
|
"loss": 6.6562,
|
|
"mean_token_accuracy": 0.12113718539476395,
|
|
"num_tokens": 1418450.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 6.8045419216156,
|
|
"epoch": 0.059558117195004805,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.5466,
|
|
"mean_token_accuracy": 0.12454390972852707,
|
|
"num_tokens": 1430048.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 6.808126592636109,
|
|
"epoch": 0.060038424591738714,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.5911,
|
|
"mean_token_accuracy": 0.12378140687942504,
|
|
"num_tokens": 1441820.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 6.753187370300293,
|
|
"epoch": 0.06051873198847262,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 6.445,
|
|
"mean_token_accuracy": 0.13010460510849953,
|
|
"num_tokens": 1453209.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 6.6527941703796385,
|
|
"epoch": 0.06099903938520653,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.4598,
|
|
"mean_token_accuracy": 0.12725651860237122,
|
|
"num_tokens": 1465423.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 6.711978006362915,
|
|
"epoch": 0.06147934678194044,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.4541,
|
|
"mean_token_accuracy": 0.13069155365228652,
|
|
"num_tokens": 1476575.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 6.659121417999268,
|
|
"epoch": 0.06195965417867435,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.000322,
|
|
"loss": 6.4109,
|
|
"mean_token_accuracy": 0.12579366862773894,
|
|
"num_tokens": 1486932.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 6.691300868988037,
|
|
"epoch": 0.06243996157540826,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 6.4399,
|
|
"mean_token_accuracy": 0.12854820042848586,
|
|
"num_tokens": 1498494.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 6.7037928104400635,
|
|
"epoch": 0.06292026897214217,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 6.4936,
|
|
"mean_token_accuracy": 0.12374913021922111,
|
|
"num_tokens": 1509937.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 6.782931184768676,
|
|
"epoch": 0.06340057636887608,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.5147,
|
|
"mean_token_accuracy": 0.13380258977413179,
|
|
"num_tokens": 1519823.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 6.726450872421265,
|
|
"epoch": 0.06388088376560999,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.5528,
|
|
"mean_token_accuracy": 0.12575417309999465,
|
|
"num_tokens": 1529943.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 6.611954069137573,
|
|
"epoch": 0.0643611911623439,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.3767,
|
|
"mean_token_accuracy": 0.13369367122650147,
|
|
"num_tokens": 1540618.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 6.685780334472656,
|
|
"epoch": 0.06484149855907781,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.5048,
|
|
"mean_token_accuracy": 0.1227756217122078,
|
|
"num_tokens": 1553208.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 6.6764894962310795,
|
|
"epoch": 0.06532180595581172,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.4589,
|
|
"mean_token_accuracy": 0.1339925467967987,
|
|
"num_tokens": 1563975.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 6.717716455459595,
|
|
"epoch": 0.06580211335254563,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.5252,
|
|
"mean_token_accuracy": 0.12458744868636132,
|
|
"num_tokens": 1575998.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 6.6251349449157715,
|
|
"epoch": 0.06628242074927954,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 6.3994,
|
|
"mean_token_accuracy": 0.13568611592054367,
|
|
"num_tokens": 1586041.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 6.637330770492554,
|
|
"epoch": 0.06676272814601344,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.4796,
|
|
"mean_token_accuracy": 0.12872253656387328,
|
|
"num_tokens": 1597531.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 6.617096710205078,
|
|
"epoch": 0.06724303554274735,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.4549,
|
|
"mean_token_accuracy": 0.12859696000814438,
|
|
"num_tokens": 1609255.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 6.640483236312866,
|
|
"epoch": 0.06772334293948126,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.439,
|
|
"mean_token_accuracy": 0.13394341096282006,
|
|
"num_tokens": 1621098.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 6.601499080657959,
|
|
"epoch": 0.06820365033621517,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.3504,
|
|
"mean_token_accuracy": 0.14078185856342315,
|
|
"num_tokens": 1631941.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 6.551211166381836,
|
|
"epoch": 0.0686839577329491,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.3471,
|
|
"mean_token_accuracy": 0.13648251742124556,
|
|
"num_tokens": 1643117.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 6.5161905765533445,
|
|
"epoch": 0.069164265129683,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.3952,
|
|
"mean_token_accuracy": 0.13429828062653543,
|
|
"num_tokens": 1653595.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 6.614610481262207,
|
|
"epoch": 0.06964457252641691,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.4168,
|
|
"mean_token_accuracy": 0.13274685442447662,
|
|
"num_tokens": 1664495.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 6.5094832420349125,
|
|
"epoch": 0.07012487992315082,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.4047,
|
|
"mean_token_accuracy": 0.136563728004694,
|
|
"num_tokens": 1674923.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 6.602942371368409,
|
|
"epoch": 0.07060518731988473,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.3045,
|
|
"mean_token_accuracy": 0.13681301474571228,
|
|
"num_tokens": 1685904.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 6.596617603302002,
|
|
"epoch": 0.07108549471661864,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.5324,
|
|
"mean_token_accuracy": 0.12432878389954567,
|
|
"num_tokens": 1699133.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 6.504991292953491,
|
|
"epoch": 0.07156580211335255,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.342,
|
|
"mean_token_accuracy": 0.13271907046437265,
|
|
"num_tokens": 1711559.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 6.592547464370727,
|
|
"epoch": 0.07204610951008646,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.2575,
|
|
"mean_token_accuracy": 0.14460937380790712,
|
|
"num_tokens": 1722526.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 6.4313709259033205,
|
|
"epoch": 0.07252641690682037,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.3265,
|
|
"mean_token_accuracy": 0.1398925192654133,
|
|
"num_tokens": 1734261.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 6.5256377220153805,
|
|
"epoch": 0.07300672430355427,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 6.3105,
|
|
"mean_token_accuracy": 0.14366703033447265,
|
|
"num_tokens": 1745151.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 6.631883907318115,
|
|
"epoch": 0.07348703170028818,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.4547,
|
|
"mean_token_accuracy": 0.1341322012245655,
|
|
"num_tokens": 1755463.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 6.584089756011963,
|
|
"epoch": 0.07396733909702209,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.4178,
|
|
"mean_token_accuracy": 0.1315837398171425,
|
|
"num_tokens": 1767717.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 6.3859930515289305,
|
|
"epoch": 0.074447646493756,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.2619,
|
|
"mean_token_accuracy": 0.14160886630415917,
|
|
"num_tokens": 1779115.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 6.3998737812042235,
|
|
"epoch": 0.07492795389048991,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.213,
|
|
"mean_token_accuracy": 0.1398429863154888,
|
|
"num_tokens": 1789644.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 6.540688323974609,
|
|
"epoch": 0.07540826128722382,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.4251,
|
|
"mean_token_accuracy": 0.13578777611255646,
|
|
"num_tokens": 1800606.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 6.513448238372803,
|
|
"epoch": 0.07588856868395773,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.4264,
|
|
"mean_token_accuracy": 0.12942690253257752,
|
|
"num_tokens": 1812168.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 6.5457319736480715,
|
|
"epoch": 0.07636887608069164,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.3796,
|
|
"mean_token_accuracy": 0.1303087830543518,
|
|
"num_tokens": 1823830.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 6.495282316207886,
|
|
"epoch": 0.07684918347742556,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.3456,
|
|
"mean_token_accuracy": 0.13957973942160606,
|
|
"num_tokens": 1835611.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 6.467644214630127,
|
|
"epoch": 0.07732949087415947,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.4127,
|
|
"mean_token_accuracy": 0.1334280975162983,
|
|
"num_tokens": 1847036.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 6.464094591140747,
|
|
"epoch": 0.07780979827089338,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.3528,
|
|
"mean_token_accuracy": 0.13223012760281563,
|
|
"num_tokens": 1857476.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 6.50727949142456,
|
|
"epoch": 0.07829010566762729,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.3773,
|
|
"mean_token_accuracy": 0.1352442115545273,
|
|
"num_tokens": 1869073.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 6.384515810012817,
|
|
"epoch": 0.0787704130643612,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.2486,
|
|
"mean_token_accuracy": 0.14026699736714363,
|
|
"num_tokens": 1880439.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 6.561717510223389,
|
|
"epoch": 0.0792507204610951,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.4116,
|
|
"mean_token_accuracy": 0.134783523529768,
|
|
"num_tokens": 1891600.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 6.414502573013306,
|
|
"epoch": 0.07973102785782901,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.3783,
|
|
"mean_token_accuracy": 0.13531816452741624,
|
|
"num_tokens": 1903126.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 6.5730548858642575,
|
|
"epoch": 0.08021133525456292,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.3467,
|
|
"mean_token_accuracy": 0.14032403156161308,
|
|
"num_tokens": 1913913.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 6.344644355773926,
|
|
"epoch": 0.08069164265129683,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.2684,
|
|
"mean_token_accuracy": 0.1382530964910984,
|
|
"num_tokens": 1924961.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 6.523792457580567,
|
|
"epoch": 0.08117195004803074,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.3612,
|
|
"mean_token_accuracy": 0.12942377403378486,
|
|
"num_tokens": 1936773.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 6.355926513671875,
|
|
"epoch": 0.08165225744476465,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.2783,
|
|
"mean_token_accuracy": 0.13875910267233849,
|
|
"num_tokens": 1948190.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 6.331581449508667,
|
|
"epoch": 0.08213256484149856,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.2694,
|
|
"mean_token_accuracy": 0.14160780385136604,
|
|
"num_tokens": 1960038.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.557125091552734,
|
|
"epoch": 0.08261287223823247,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.3489,
|
|
"mean_token_accuracy": 0.14002878665924073,
|
|
"num_tokens": 1970535.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 6.411432456970215,
|
|
"epoch": 0.08309317963496637,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.3226,
|
|
"mean_token_accuracy": 0.13546231836080552,
|
|
"num_tokens": 1981386.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 6.337710332870484,
|
|
"epoch": 0.08357348703170028,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.2428,
|
|
"mean_token_accuracy": 0.1426716774702072,
|
|
"num_tokens": 1993196.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 6.432919025421143,
|
|
"epoch": 0.08405379442843419,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.2741,
|
|
"mean_token_accuracy": 0.14658503904938697,
|
|
"num_tokens": 2004756.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 6.315603113174438,
|
|
"epoch": 0.0845341018251681,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.2347,
|
|
"mean_token_accuracy": 0.14145326390862464,
|
|
"num_tokens": 2016020.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 6.380750274658203,
|
|
"epoch": 0.08501440922190202,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.2819,
|
|
"mean_token_accuracy": 0.14082487300038338,
|
|
"num_tokens": 2027747.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 6.4264098644256595,
|
|
"epoch": 0.08549471661863593,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.2553,
|
|
"mean_token_accuracy": 0.13818828240036965,
|
|
"num_tokens": 2038841.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 6.385887289047242,
|
|
"epoch": 0.08597502401536984,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.3043,
|
|
"mean_token_accuracy": 0.13402576446533204,
|
|
"num_tokens": 2049905.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 6.424469089508056,
|
|
"epoch": 0.08645533141210375,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.3803,
|
|
"mean_token_accuracy": 0.13485484719276428,
|
|
"num_tokens": 2062492.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 6.387258577346802,
|
|
"epoch": 0.08693563880883766,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.31,
|
|
"mean_token_accuracy": 0.1353304862976074,
|
|
"num_tokens": 2073840.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 6.3580629348754885,
|
|
"epoch": 0.08741594620557157,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.221,
|
|
"mean_token_accuracy": 0.14060378223657607,
|
|
"num_tokens": 2085720.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 6.353258228302002,
|
|
"epoch": 0.08789625360230548,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.3039,
|
|
"mean_token_accuracy": 0.1413162462413311,
|
|
"num_tokens": 2096649.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 6.436611890792847,
|
|
"epoch": 0.08837656099903939,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.3061,
|
|
"mean_token_accuracy": 0.14285610914230346,
|
|
"num_tokens": 2109030.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 6.35608320236206,
|
|
"epoch": 0.0888568683957733,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.2113,
|
|
"mean_token_accuracy": 0.14488047659397124,
|
|
"num_tokens": 2121384.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 6.269479846954345,
|
|
"epoch": 0.0893371757925072,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.1635,
|
|
"mean_token_accuracy": 0.147640460729599,
|
|
"num_tokens": 2131377.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 6.344134902954101,
|
|
"epoch": 0.08981748318924111,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.3531,
|
|
"mean_token_accuracy": 0.1383367098867893,
|
|
"num_tokens": 2142364.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 6.356987571716308,
|
|
"epoch": 0.09029779058597502,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.2296,
|
|
"mean_token_accuracy": 0.14149210676550866,
|
|
"num_tokens": 2153040.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 6.35843825340271,
|
|
"epoch": 0.09077809798270893,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.2728,
|
|
"mean_token_accuracy": 0.14314480721950532,
|
|
"num_tokens": 2165571.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 6.3020600318908695,
|
|
"epoch": 0.09125840537944284,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.2423,
|
|
"mean_token_accuracy": 0.14072795882821082,
|
|
"num_tokens": 2177241.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.329180097579956,
|
|
"epoch": 0.09173871277617675,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.2801,
|
|
"mean_token_accuracy": 0.1361616224050522,
|
|
"num_tokens": 2187475.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 6.315436792373657,
|
|
"epoch": 0.09221902017291066,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.3087,
|
|
"mean_token_accuracy": 0.14151085540652275,
|
|
"num_tokens": 2198185.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 6.303459358215332,
|
|
"epoch": 0.09269932756964457,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.2346,
|
|
"mean_token_accuracy": 0.14740882739424704,
|
|
"num_tokens": 2210404.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.370419549942016,
|
|
"epoch": 0.09317963496637849,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.2262,
|
|
"mean_token_accuracy": 0.144054813683033,
|
|
"num_tokens": 2222188.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 6.290718269348145,
|
|
"epoch": 0.0936599423631124,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.2775,
|
|
"mean_token_accuracy": 0.1421047918498516,
|
|
"num_tokens": 2233418.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 6.352431869506836,
|
|
"epoch": 0.0941402497598463,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.2415,
|
|
"mean_token_accuracy": 0.14807373881340027,
|
|
"num_tokens": 2245053.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 6.250268840789795,
|
|
"epoch": 0.09462055715658022,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.2715,
|
|
"mean_token_accuracy": 0.14363499581813813,
|
|
"num_tokens": 2256375.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 6.225133609771729,
|
|
"epoch": 0.09510086455331412,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.1142,
|
|
"mean_token_accuracy": 0.1477846160531044,
|
|
"num_tokens": 2267074.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.191523456573487,
|
|
"epoch": 0.09558117195004803,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.1547,
|
|
"mean_token_accuracy": 0.14838184416294098,
|
|
"num_tokens": 2277168.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.25091781616211,
|
|
"epoch": 0.09606147934678194,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.1381,
|
|
"mean_token_accuracy": 0.14807945489883423,
|
|
"num_tokens": 2288178.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 6.215264129638672,
|
|
"epoch": 0.09654178674351585,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999999983283737,
|
|
"loss": 6.1686,
|
|
"mean_token_accuracy": 0.1440332628786564,
|
|
"num_tokens": 2299765.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"entropy": 6.3124645233154295,
|
|
"epoch": 0.09702209414024976,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999999915373924,
|
|
"loss": 6.2644,
|
|
"mean_token_accuracy": 0.13689299449324607,
|
|
"num_tokens": 2312047.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 6.30297064781189,
|
|
"epoch": 0.09750240153698367,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999999795225793,
|
|
"loss": 6.2563,
|
|
"mean_token_accuracy": 0.1363622300326824,
|
|
"num_tokens": 2324118.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"entropy": 6.299112796783447,
|
|
"epoch": 0.09798270893371758,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004999999622839347,
|
|
"loss": 6.2494,
|
|
"mean_token_accuracy": 0.14326749965548516,
|
|
"num_tokens": 2335171.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 6.283253812789917,
|
|
"epoch": 0.09846301633045149,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999999398214593,
|
|
"loss": 6.1501,
|
|
"mean_token_accuracy": 0.14212532341480255,
|
|
"num_tokens": 2346338.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 6.212884902954102,
|
|
"epoch": 0.0989433237271854,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999999121351532,
|
|
"loss": 6.1934,
|
|
"mean_token_accuracy": 0.14963782876729964,
|
|
"num_tokens": 2357185.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 6.190281915664673,
|
|
"epoch": 0.0994236311239193,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999998792250173,
|
|
"loss": 6.1183,
|
|
"mean_token_accuracy": 0.15685753300786018,
|
|
"num_tokens": 2368494.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"entropy": 6.289627552032471,
|
|
"epoch": 0.09990393852065321,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999998410910524,
|
|
"loss": 6.3364,
|
|
"mean_token_accuracy": 0.13329742476344109,
|
|
"num_tokens": 2380800.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 6.3118733882904055,
|
|
"epoch": 0.10038424591738712,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999997977332592,
|
|
"loss": 6.2551,
|
|
"mean_token_accuracy": 0.13934137374162675,
|
|
"num_tokens": 2391753.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"entropy": 6.178606843948364,
|
|
"epoch": 0.10086455331412104,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999997491516389,
|
|
"loss": 6.1391,
|
|
"mean_token_accuracy": 0.1400229126214981,
|
|
"num_tokens": 2403324.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 6.235824918746948,
|
|
"epoch": 0.10134486071085495,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999996953461925,
|
|
"loss": 6.2482,
|
|
"mean_token_accuracy": 0.13423383459448815,
|
|
"num_tokens": 2414873.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"entropy": 6.138184642791748,
|
|
"epoch": 0.10182516810758886,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999996363169212,
|
|
"loss": 6.0208,
|
|
"mean_token_accuracy": 0.15671658217906953,
|
|
"num_tokens": 2425308.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 6.144180011749268,
|
|
"epoch": 0.10230547550432277,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999995720638266,
|
|
"loss": 6.0654,
|
|
"mean_token_accuracy": 0.1525282308459282,
|
|
"num_tokens": 2436835.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"entropy": 6.183439445495606,
|
|
"epoch": 0.10278578290105668,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00049999950258691,
|
|
"loss": 6.1921,
|
|
"mean_token_accuracy": 0.1451313279569149,
|
|
"num_tokens": 2446798.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 6.123720979690551,
|
|
"epoch": 0.10326609029779059,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999994278861731,
|
|
"loss": 6.0747,
|
|
"mean_token_accuracy": 0.15084402859210969,
|
|
"num_tokens": 2457308.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 6.215669107437134,
|
|
"epoch": 0.1037463976945245,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999993479616175,
|
|
"loss": 6.1309,
|
|
"mean_token_accuracy": 0.13830516785383223,
|
|
"num_tokens": 2468917.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 6.227848720550537,
|
|
"epoch": 0.1042267050912584,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999992628132451,
|
|
"loss": 6.1529,
|
|
"mean_token_accuracy": 0.14558819606900214,
|
|
"num_tokens": 2481363.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"entropy": 6.175233983993531,
|
|
"epoch": 0.10470701248799232,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999991724410582,
|
|
"loss": 6.1551,
|
|
"mean_token_accuracy": 0.14347582682967186,
|
|
"num_tokens": 2493082.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 6.150361251831055,
|
|
"epoch": 0.10518731988472622,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999990768450583,
|
|
"loss": 6.106,
|
|
"mean_token_accuracy": 0.1499667778611183,
|
|
"num_tokens": 2503849.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"entropy": 6.225272464752197,
|
|
"epoch": 0.10566762728146013,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999989760252482,
|
|
"loss": 6.1511,
|
|
"mean_token_accuracy": 0.14817013815045357,
|
|
"num_tokens": 2514528.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 6.097928714752197,
|
|
"epoch": 0.10614793467819404,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004999988699816299,
|
|
"loss": 6.1427,
|
|
"mean_token_accuracy": 0.14771459847688675,
|
|
"num_tokens": 2524971.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"entropy": 6.153327941894531,
|
|
"epoch": 0.10662824207492795,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999987587142058,
|
|
"loss": 6.057,
|
|
"mean_token_accuracy": 0.14452041387557985,
|
|
"num_tokens": 2535674.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 6.2696786403656,
|
|
"epoch": 0.10710854947166186,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999986422229789,
|
|
"loss": 6.2903,
|
|
"mean_token_accuracy": 0.13996392711997033,
|
|
"num_tokens": 2547108.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"entropy": 6.155757236480713,
|
|
"epoch": 0.10758885686839577,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999985205079514,
|
|
"loss": 6.1047,
|
|
"mean_token_accuracy": 0.1451355442404747,
|
|
"num_tokens": 2559474.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 6.012842035293579,
|
|
"epoch": 0.10806916426512968,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999983935691265,
|
|
"loss": 5.9441,
|
|
"mean_token_accuracy": 0.16244944632053376,
|
|
"num_tokens": 2571264.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 6.159362649917602,
|
|
"epoch": 0.10854947166186359,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499998261406507,
|
|
"loss": 6.1208,
|
|
"mean_token_accuracy": 0.1507526934146881,
|
|
"num_tokens": 2583731.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 6.268857860565186,
|
|
"epoch": 0.10902977905859751,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004999981240200958,
|
|
"loss": 6.1607,
|
|
"mean_token_accuracy": 0.14638862013816833,
|
|
"num_tokens": 2595497.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"entropy": 6.053813219070435,
|
|
"epoch": 0.10951008645533142,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999979814098966,
|
|
"loss": 6.1148,
|
|
"mean_token_accuracy": 0.1516471363604069,
|
|
"num_tokens": 2607358.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 6.1449603080749515,
|
|
"epoch": 0.10999039385206533,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999978335759121,
|
|
"loss": 6.0354,
|
|
"mean_token_accuracy": 0.15392047837376593,
|
|
"num_tokens": 2618936.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"entropy": 6.154958772659302,
|
|
"epoch": 0.11047070124879924,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999976805181461,
|
|
"loss": 6.1981,
|
|
"mean_token_accuracy": 0.14167412593960763,
|
|
"num_tokens": 2631840.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 6.140295743942261,
|
|
"epoch": 0.11095100864553314,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499997522236602,
|
|
"loss": 6.1443,
|
|
"mean_token_accuracy": 0.15361175835132598,
|
|
"num_tokens": 2642412.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"entropy": 6.160842370986939,
|
|
"epoch": 0.11143131604226705,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999973587312837,
|
|
"loss": 6.1067,
|
|
"mean_token_accuracy": 0.14919153451919556,
|
|
"num_tokens": 2653890.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 6.146590614318848,
|
|
"epoch": 0.11191162343900096,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999971900021947,
|
|
"loss": 6.163,
|
|
"mean_token_accuracy": 0.15273661985993386,
|
|
"num_tokens": 2664888.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"entropy": 6.159024953842163,
|
|
"epoch": 0.11239193083573487,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999970160493391,
|
|
"loss": 6.0579,
|
|
"mean_token_accuracy": 0.14569913148880004,
|
|
"num_tokens": 2675550.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 6.02392611503601,
|
|
"epoch": 0.11287223823246878,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999968368727209,
|
|
"loss": 6.0724,
|
|
"mean_token_accuracy": 0.15466973930597305,
|
|
"num_tokens": 2688022.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 6.1862691879272464,
|
|
"epoch": 0.11335254562920269,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999966524723442,
|
|
"loss": 6.0632,
|
|
"mean_token_accuracy": 0.14964798092842102,
|
|
"num_tokens": 2698737.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 6.077165365219116,
|
|
"epoch": 0.1138328530259366,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999964628482135,
|
|
"loss": 6.0344,
|
|
"mean_token_accuracy": 0.15742302685976028,
|
|
"num_tokens": 2709844.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"entropy": 6.127112817764282,
|
|
"epoch": 0.1143131604226705,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999962680003328,
|
|
"loss": 6.1035,
|
|
"mean_token_accuracy": 0.1519095703959465,
|
|
"num_tokens": 2720273.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 6.1255943775177,
|
|
"epoch": 0.11479346781940442,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499996067928707,
|
|
"loss": 6.1124,
|
|
"mean_token_accuracy": 0.14679019302129745,
|
|
"num_tokens": 2731354.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"entropy": 6.127178192138672,
|
|
"epoch": 0.11527377521613832,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999958626333406,
|
|
"loss": 6.1052,
|
|
"mean_token_accuracy": 0.1527300015091896,
|
|
"num_tokens": 2742966.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 6.03611798286438,
|
|
"epoch": 0.11575408261287223,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999956521142383,
|
|
"loss": 6.009,
|
|
"mean_token_accuracy": 0.1586822062730789,
|
|
"num_tokens": 2755010.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"entropy": 6.0991308212280275,
|
|
"epoch": 0.11623439000960614,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999954363714051,
|
|
"loss": 6.0361,
|
|
"mean_token_accuracy": 0.14981242269277573,
|
|
"num_tokens": 2766176.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 6.185801792144775,
|
|
"epoch": 0.11671469740634005,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999952154048459,
|
|
"loss": 6.1829,
|
|
"mean_token_accuracy": 0.15044604614377022,
|
|
"num_tokens": 2777861.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"entropy": 6.021704149246216,
|
|
"epoch": 0.11719500480307397,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499994989214566,
|
|
"loss": 5.9954,
|
|
"mean_token_accuracy": 0.1536705419421196,
|
|
"num_tokens": 2788725.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 6.0181561470031735,
|
|
"epoch": 0.11767531219980788,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999947578005705,
|
|
"loss": 6.0312,
|
|
"mean_token_accuracy": 0.15193646997213364,
|
|
"num_tokens": 2801613.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 6.218272686004639,
|
|
"epoch": 0.11815561959654179,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999945211628648,
|
|
"loss": 6.0986,
|
|
"mean_token_accuracy": 0.1493365317583084,
|
|
"num_tokens": 2812474.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 5.971197032928467,
|
|
"epoch": 0.1186359269932757,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999942793014544,
|
|
"loss": 6.0103,
|
|
"mean_token_accuracy": 0.15563429594039918,
|
|
"num_tokens": 2823178.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"entropy": 6.045905733108521,
|
|
"epoch": 0.11911623439000961,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.000499994032216345,
|
|
"loss": 6.0211,
|
|
"mean_token_accuracy": 0.15064174830913543,
|
|
"num_tokens": 2836486.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 6.107371759414673,
|
|
"epoch": 0.11959654178674352,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999937799075422,
|
|
"loss": 6.0746,
|
|
"mean_token_accuracy": 0.1570821538567543,
|
|
"num_tokens": 2847902.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"entropy": 5.903108596801758,
|
|
"epoch": 0.12007684918347743,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000499993522375052,
|
|
"loss": 5.9739,
|
|
"mean_token_accuracy": 0.15461545437574387,
|
|
"num_tokens": 2859991.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 6.248143100738526,
|
|
"epoch": 0.12055715658021134,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999932596188802,
|
|
"loss": 6.1545,
|
|
"mean_token_accuracy": 0.14593613222241403,
|
|
"num_tokens": 2870269.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"entropy": 6.034249687194825,
|
|
"epoch": 0.12103746397694524,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999929916390331,
|
|
"loss": 6.0279,
|
|
"mean_token_accuracy": 0.14597706943750383,
|
|
"num_tokens": 2882191.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 5.966269588470459,
|
|
"epoch": 0.12151777137367915,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999927184355169,
|
|
"loss": 6.0372,
|
|
"mean_token_accuracy": 0.14836430177092552,
|
|
"num_tokens": 2892775.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"entropy": 6.147925519943238,
|
|
"epoch": 0.12199807877041306,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999924400083377,
|
|
"loss": 6.0247,
|
|
"mean_token_accuracy": 0.15831544399261474,
|
|
"num_tokens": 2904750.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 6.081568050384521,
|
|
"epoch": 0.12247838616714697,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999921563575022,
|
|
"loss": 6.0988,
|
|
"mean_token_accuracy": 0.14920950308442116,
|
|
"num_tokens": 2916150.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 6.07696213722229,
|
|
"epoch": 0.12295869356388088,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999918674830169,
|
|
"loss": 6.0644,
|
|
"mean_token_accuracy": 0.1496642827987671,
|
|
"num_tokens": 2928452.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 6.035782670974731,
|
|
"epoch": 0.12343900096061479,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999915733848886,
|
|
"loss": 6.0442,
|
|
"mean_token_accuracy": 0.1454036220908165,
|
|
"num_tokens": 2940577.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"entropy": 6.022758436203003,
|
|
"epoch": 0.1239193083573487,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499991274063124,
|
|
"loss": 6.0283,
|
|
"mean_token_accuracy": 0.15150520876049994,
|
|
"num_tokens": 2952302.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 6.0645428657531735,
|
|
"epoch": 0.12439961575408261,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999909695177301,
|
|
"loss": 6.0669,
|
|
"mean_token_accuracy": 0.15440516471862792,
|
|
"num_tokens": 2964611.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"entropy": 6.0961566925048825,
|
|
"epoch": 0.12487992315081652,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499990659748714,
|
|
"loss": 6.05,
|
|
"mean_token_accuracy": 0.15006925463676452,
|
|
"num_tokens": 2975668.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 6.146146440505982,
|
|
"epoch": 0.12536023054755044,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999903447560828,
|
|
"loss": 6.1198,
|
|
"mean_token_accuracy": 0.14781473577022552,
|
|
"num_tokens": 2987303.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"entropy": 6.117984342575073,
|
|
"epoch": 0.12584053794428435,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999900245398439,
|
|
"loss": 6.0166,
|
|
"mean_token_accuracy": 0.16036698669195176,
|
|
"num_tokens": 3000400.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 6.010946893692017,
|
|
"epoch": 0.12632084534101826,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999896991000047,
|
|
"loss": 5.9477,
|
|
"mean_token_accuracy": 0.1495976448059082,
|
|
"num_tokens": 3012336.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"entropy": 6.054377698898316,
|
|
"epoch": 0.12680115273775217,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999893684365729,
|
|
"loss": 6.0047,
|
|
"mean_token_accuracy": 0.15137309059500695,
|
|
"num_tokens": 3023004.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 6.044629859924316,
|
|
"epoch": 0.12728146013448607,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004999890325495559,
|
|
"loss": 6.0922,
|
|
"mean_token_accuracy": 0.147823116928339,
|
|
"num_tokens": 3035147.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 6.072157478332519,
|
|
"epoch": 0.12776176753121998,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999886914389617,
|
|
"loss": 5.9177,
|
|
"mean_token_accuracy": 0.1551705077290535,
|
|
"num_tokens": 3045611.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 5.916638660430908,
|
|
"epoch": 0.1282420749279539,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999883451047981,
|
|
"loss": 5.9296,
|
|
"mean_token_accuracy": 0.1561925306916237,
|
|
"num_tokens": 3056420.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"entropy": 5.977782440185547,
|
|
"epoch": 0.1287223823246878,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999879935470733,
|
|
"loss": 5.9227,
|
|
"mean_token_accuracy": 0.15750788599252702,
|
|
"num_tokens": 3068770.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 6.05616979598999,
|
|
"epoch": 0.1292026897214217,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999876367657954,
|
|
"loss": 6.0521,
|
|
"mean_token_accuracy": 0.14580482840538025,
|
|
"num_tokens": 3080806.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"entropy": 6.143747854232788,
|
|
"epoch": 0.12968299711815562,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999872747609725,
|
|
"loss": 6.0742,
|
|
"mean_token_accuracy": 0.1484417587518692,
|
|
"num_tokens": 3091769.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 5.9879156112670895,
|
|
"epoch": 0.13016330451488953,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004999869075326132,
|
|
"loss": 5.9938,
|
|
"mean_token_accuracy": 0.15191702395677567,
|
|
"num_tokens": 3103121.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"entropy": 6.010816240310669,
|
|
"epoch": 0.13064361191162344,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.000499986535080726,
|
|
"loss": 5.9724,
|
|
"mean_token_accuracy": 0.16233935654163362,
|
|
"num_tokens": 3115606.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 6.026129817962646,
|
|
"epoch": 0.13112391930835735,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999861574053196,
|
|
"loss": 5.8723,
|
|
"mean_token_accuracy": 0.16096271872520446,
|
|
"num_tokens": 3127961.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"entropy": 5.87260947227478,
|
|
"epoch": 0.13160422670509125,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999857745064027,
|
|
"loss": 5.8905,
|
|
"mean_token_accuracy": 0.15895691215991975,
|
|
"num_tokens": 3138316.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 5.953699588775635,
|
|
"epoch": 0.13208453410182516,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.000499985386383984,
|
|
"loss": 5.8671,
|
|
"mean_token_accuracy": 0.15866711735725403,
|
|
"num_tokens": 3150818.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 6.006815195083618,
|
|
"epoch": 0.13256484149855907,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999849930380729,
|
|
"loss": 6.0195,
|
|
"mean_token_accuracy": 0.1508159779012203,
|
|
"num_tokens": 3162066.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 5.941660642623901,
|
|
"epoch": 0.13304514889529298,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999845944686781,
|
|
"loss": 5.9924,
|
|
"mean_token_accuracy": 0.1508888617157936,
|
|
"num_tokens": 3172209.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"entropy": 5.954594707489013,
|
|
"epoch": 0.1335254562920269,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999841906758093,
|
|
"loss": 5.8218,
|
|
"mean_token_accuracy": 0.1675858825445175,
|
|
"num_tokens": 3183248.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 5.94215030670166,
|
|
"epoch": 0.1340057636887608,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999837816594757,
|
|
"loss": 5.9139,
|
|
"mean_token_accuracy": 0.15847276002168656,
|
|
"num_tokens": 3194748.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"entropy": 5.930553770065307,
|
|
"epoch": 0.1344860710854947,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999833674196865,
|
|
"loss": 5.8849,
|
|
"mean_token_accuracy": 0.16950529664754868,
|
|
"num_tokens": 3205669.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 5.932918214797974,
|
|
"epoch": 0.13496637848222862,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999829479564518,
|
|
"loss": 5.9807,
|
|
"mean_token_accuracy": 0.14995542094111441,
|
|
"num_tokens": 3216035.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"entropy": 6.064324188232422,
|
|
"epoch": 0.13544668587896252,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499982523269781,
|
|
"loss": 5.9647,
|
|
"mean_token_accuracy": 0.15931690335273743,
|
|
"num_tokens": 3227192.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 5.975619888305664,
|
|
"epoch": 0.13592699327569643,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999820933596842,
|
|
"loss": 5.9871,
|
|
"mean_token_accuracy": 0.15620121210813523,
|
|
"num_tokens": 3240237.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"entropy": 5.962911701202392,
|
|
"epoch": 0.13640730067243034,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499981658226171,
|
|
"loss": 5.8734,
|
|
"mean_token_accuracy": 0.16469697579741477,
|
|
"num_tokens": 3251963.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 5.908741474151611,
|
|
"epoch": 0.13688760806916425,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499981217869252,
|
|
"loss": 5.9953,
|
|
"mean_token_accuracy": 0.15814436972141266,
|
|
"num_tokens": 3263101.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 5.985613679885864,
|
|
"epoch": 0.1373679154658982,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499980772288937,
|
|
"loss": 5.8679,
|
|
"mean_token_accuracy": 0.16649020761251448,
|
|
"num_tokens": 3275100.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 5.945235109329223,
|
|
"epoch": 0.1378482228626321,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004999803214852367,
|
|
"loss": 5.9638,
|
|
"mean_token_accuracy": 0.15565589517354966,
|
|
"num_tokens": 3287025.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"entropy": 6.04934253692627,
|
|
"epoch": 0.138328530259366,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004999798654581613,
|
|
"loss": 5.9662,
|
|
"mean_token_accuracy": 0.15883919447660447,
|
|
"num_tokens": 3299867.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 5.918570852279663,
|
|
"epoch": 0.13880883765609991,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999794042077214,
|
|
"loss": 5.9038,
|
|
"mean_token_accuracy": 0.16191874593496322,
|
|
"num_tokens": 3311183.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"entropy": 5.952925539016723,
|
|
"epoch": 0.13928914505283382,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999789377339279,
|
|
"loss": 5.9687,
|
|
"mean_token_accuracy": 0.15641413480043412,
|
|
"num_tokens": 3322247.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 5.962415742874145,
|
|
"epoch": 0.13976945244956773,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999784660367915,
|
|
"loss": 5.8826,
|
|
"mean_token_accuracy": 0.1588966131210327,
|
|
"num_tokens": 3333369.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"entropy": 5.904612874984741,
|
|
"epoch": 0.14024975984630164,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999779891163231,
|
|
"loss": 5.9113,
|
|
"mean_token_accuracy": 0.16011089235544204,
|
|
"num_tokens": 3345876.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 5.91278772354126,
|
|
"epoch": 0.14073006724303555,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999775069725339,
|
|
"loss": 5.8124,
|
|
"mean_token_accuracy": 0.1629629462957382,
|
|
"num_tokens": 3357323.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"entropy": 5.912459039688111,
|
|
"epoch": 0.14121037463976946,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499977019605435,
|
|
"loss": 5.897,
|
|
"mean_token_accuracy": 0.15947655588388443,
|
|
"num_tokens": 3367689.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 5.844752836227417,
|
|
"epoch": 0.14169068203650337,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999765270150378,
|
|
"loss": 5.8568,
|
|
"mean_token_accuracy": 0.15955205261707306,
|
|
"num_tokens": 3379472.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 5.996302938461303,
|
|
"epoch": 0.14217098943323728,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999760292013536,
|
|
"loss": 5.8922,
|
|
"mean_token_accuracy": 0.15859662368893623,
|
|
"num_tokens": 3390929.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 5.99014687538147,
|
|
"epoch": 0.14265129682997119,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999755261643941,
|
|
"loss": 5.8976,
|
|
"mean_token_accuracy": 0.16287715286016463,
|
|
"num_tokens": 3401242.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"entropy": 5.869934892654419,
|
|
"epoch": 0.1431316042267051,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999750179041709,
|
|
"loss": 5.8878,
|
|
"mean_token_accuracy": 0.16124220937490463,
|
|
"num_tokens": 3411169.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 5.874157810211182,
|
|
"epoch": 0.143611911623439,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999745044206959,
|
|
"loss": 5.7279,
|
|
"mean_token_accuracy": 0.16647156924009324,
|
|
"num_tokens": 3423265.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"entropy": 5.832660913467407,
|
|
"epoch": 0.1440922190201729,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999739857139809,
|
|
"loss": 5.8347,
|
|
"mean_token_accuracy": 0.16908216327428818,
|
|
"num_tokens": 3434793.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 5.757522106170654,
|
|
"epoch": 0.14457252641690682,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000499973461784038,
|
|
"loss": 5.7679,
|
|
"mean_token_accuracy": 0.17928926199674605,
|
|
"num_tokens": 3445732.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"entropy": 5.942258501052857,
|
|
"epoch": 0.14505283381364073,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999729326308792,
|
|
"loss": 5.9516,
|
|
"mean_token_accuracy": 0.15832037180662156,
|
|
"num_tokens": 3457090.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 5.99946174621582,
|
|
"epoch": 0.14553314121037464,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499972398254517,
|
|
"loss": 5.9388,
|
|
"mean_token_accuracy": 0.15340567082166673,
|
|
"num_tokens": 3468087.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"entropy": 5.941799163818359,
|
|
"epoch": 0.14601344860710855,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499971858654964,
|
|
"loss": 5.8778,
|
|
"mean_token_accuracy": 0.1609287366271019,
|
|
"num_tokens": 3478820.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 5.859274196624756,
|
|
"epoch": 0.14649375600384246,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999713138322321,
|
|
"loss": 5.9021,
|
|
"mean_token_accuracy": 0.15754427909851074,
|
|
"num_tokens": 3489878.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 5.942076396942139,
|
|
"epoch": 0.14697406340057637,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999707637863346,
|
|
"loss": 5.8905,
|
|
"mean_token_accuracy": 0.1585473045706749,
|
|
"num_tokens": 3500944.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 5.8406360149383545,
|
|
"epoch": 0.14745437079731027,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999702085172838,
|
|
"loss": 5.8719,
|
|
"mean_token_accuracy": 0.16607238352298737,
|
|
"num_tokens": 3511383.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"entropy": 5.969763612747192,
|
|
"epoch": 0.14793467819404418,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999696480250929,
|
|
"loss": 5.963,
|
|
"mean_token_accuracy": 0.15430965945124625,
|
|
"num_tokens": 3523300.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 5.970634698867798,
|
|
"epoch": 0.1484149855907781,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999690823097747,
|
|
"loss": 5.8799,
|
|
"mean_token_accuracy": 0.1521039791405201,
|
|
"num_tokens": 3534371.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"entropy": 5.841155576705932,
|
|
"epoch": 0.148895292987512,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999685113713426,
|
|
"loss": 5.8552,
|
|
"mean_token_accuracy": 0.16120514869689942,
|
|
"num_tokens": 3544847.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 5.92685284614563,
|
|
"epoch": 0.1493756003842459,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999679352098096,
|
|
"loss": 5.8223,
|
|
"mean_token_accuracy": 0.16645588725805283,
|
|
"num_tokens": 3555859.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"entropy": 5.8343531608581545,
|
|
"epoch": 0.14985590778097982,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999673538251891,
|
|
"loss": 5.8389,
|
|
"mean_token_accuracy": 0.15894080251455306,
|
|
"num_tokens": 3568283.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 5.834793663024902,
|
|
"epoch": 0.15033621517771373,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999667672174947,
|
|
"loss": 5.917,
|
|
"mean_token_accuracy": 0.1583700641989708,
|
|
"num_tokens": 3581442.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"entropy": 6.0175745487213135,
|
|
"epoch": 0.15081652257444764,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00049996617538674,
|
|
"loss": 5.9571,
|
|
"mean_token_accuracy": 0.15496992468833923,
|
|
"num_tokens": 3594055.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 5.962413930892945,
|
|
"epoch": 0.15129682997118155,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999655783329386,
|
|
"loss": 5.9187,
|
|
"mean_token_accuracy": 0.15283605754375457,
|
|
"num_tokens": 3605952.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 5.910793209075928,
|
|
"epoch": 0.15177713736791545,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999649760561046,
|
|
"loss": 5.9577,
|
|
"mean_token_accuracy": 0.158383572101593,
|
|
"num_tokens": 3618544.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 5.908201408386231,
|
|
"epoch": 0.15225744476464936,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999643685562519,
|
|
"loss": 5.8929,
|
|
"mean_token_accuracy": 0.16440413743257523,
|
|
"num_tokens": 3630445.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"entropy": 5.935053777694702,
|
|
"epoch": 0.15273775216138327,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999637558333945,
|
|
"loss": 5.8797,
|
|
"mean_token_accuracy": 0.16155748218297958,
|
|
"num_tokens": 3642516.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 5.843541431427002,
|
|
"epoch": 0.15321805955811718,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999631378875467,
|
|
"loss": 5.8175,
|
|
"mean_token_accuracy": 0.16581382006406784,
|
|
"num_tokens": 3654425.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"entropy": 5.805763053894043,
|
|
"epoch": 0.15369836695485112,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999625147187228,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.16464165300130845,
|
|
"num_tokens": 3666521.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 6.019205856323242,
|
|
"epoch": 0.15417867435158503,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999618863269373,
|
|
"loss": 5.8806,
|
|
"mean_token_accuracy": 0.15575164407491685,
|
|
"num_tokens": 3679121.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"entropy": 5.91282377243042,
|
|
"epoch": 0.15465898174831894,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999612527122049,
|
|
"loss": 5.8941,
|
|
"mean_token_accuracy": 0.15461272597312928,
|
|
"num_tokens": 3691095.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 5.826972103118896,
|
|
"epoch": 0.15513928914505284,
|
|
"grad_norm": 0.87109375,
|
|
"learning_rate": 0.0004999606138745402,
|
|
"loss": 5.8562,
|
|
"mean_token_accuracy": 0.16407538801431656,
|
|
"num_tokens": 3703426.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"entropy": 5.967412042617798,
|
|
"epoch": 0.15561959654178675,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999599698139581,
|
|
"loss": 5.9309,
|
|
"mean_token_accuracy": 0.1637990355491638,
|
|
"num_tokens": 3715429.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 5.932253503799439,
|
|
"epoch": 0.15609990393852066,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999593205304734,
|
|
"loss": 5.909,
|
|
"mean_token_accuracy": 0.15584128946065903,
|
|
"num_tokens": 3726327.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"entropy": 5.9037374496459964,
|
|
"epoch": 0.15658021133525457,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999586660241012,
|
|
"loss": 5.8582,
|
|
"mean_token_accuracy": 0.1553866222500801,
|
|
"num_tokens": 3736818.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 5.929326868057251,
|
|
"epoch": 0.15706051873198848,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999580062948569,
|
|
"loss": 5.8583,
|
|
"mean_token_accuracy": 0.16254822611808778,
|
|
"num_tokens": 3747776.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"entropy": 5.7625970363616945,
|
|
"epoch": 0.1575408261287224,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999573413427556,
|
|
"loss": 5.7301,
|
|
"mean_token_accuracy": 0.164338056743145,
|
|
"num_tokens": 3758990.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 5.8398857593536375,
|
|
"epoch": 0.1580211335254563,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999566711678128,
|
|
"loss": 5.7961,
|
|
"mean_token_accuracy": 0.1605479434132576,
|
|
"num_tokens": 3769686.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"entropy": 5.867894649505615,
|
|
"epoch": 0.1585014409221902,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999559957700442,
|
|
"loss": 5.8554,
|
|
"mean_token_accuracy": 0.16354380249977113,
|
|
"num_tokens": 3781815.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 5.88207426071167,
|
|
"epoch": 0.15898174831892412,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999553151494653,
|
|
"loss": 5.9139,
|
|
"mean_token_accuracy": 0.15942219495773316,
|
|
"num_tokens": 3793392.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"entropy": 5.860579538345337,
|
|
"epoch": 0.15946205571565802,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999546293060919,
|
|
"loss": 5.8298,
|
|
"mean_token_accuracy": 0.16041782200336457,
|
|
"num_tokens": 3804974.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 5.799793004989624,
|
|
"epoch": 0.15994236311239193,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00049995393823994,
|
|
"loss": 5.7028,
|
|
"mean_token_accuracy": 0.17192372530698777,
|
|
"num_tokens": 3817166.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"entropy": 5.849306297302246,
|
|
"epoch": 0.16042267050912584,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999532419510255,
|
|
"loss": 5.8307,
|
|
"mean_token_accuracy": 0.1580624461174011,
|
|
"num_tokens": 3828151.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 5.847281789779663,
|
|
"epoch": 0.16090297790585975,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499952540439365,
|
|
"loss": 5.8283,
|
|
"mean_token_accuracy": 0.16032543033361435,
|
|
"num_tokens": 3839439.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"entropy": 5.906755828857422,
|
|
"epoch": 0.16138328530259366,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999518337049743,
|
|
"loss": 5.8813,
|
|
"mean_token_accuracy": 0.15963228195905685,
|
|
"num_tokens": 3851694.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 5.831542205810547,
|
|
"epoch": 0.16186359269932757,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.00049995112174787,
|
|
"loss": 5.8589,
|
|
"mean_token_accuracy": 0.15917099863290787,
|
|
"num_tokens": 3863593.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"entropy": 5.811672306060791,
|
|
"epoch": 0.16234390009606148,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999504045680687,
|
|
"loss": 5.7935,
|
|
"mean_token_accuracy": 0.1701650395989418,
|
|
"num_tokens": 3874588.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 5.894420862197876,
|
|
"epoch": 0.1628242074927954,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999496821655869,
|
|
"loss": 5.8753,
|
|
"mean_token_accuracy": 0.16022350043058395,
|
|
"num_tokens": 3884662.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"entropy": 5.956241655349731,
|
|
"epoch": 0.1633045148895293,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.0004999489545404414,
|
|
"loss": 5.9739,
|
|
"mean_token_accuracy": 0.15092033073306083,
|
|
"num_tokens": 3896569.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 5.943658018112183,
|
|
"epoch": 0.1637848222862632,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.0004999482216926493,
|
|
"loss": 5.8162,
|
|
"mean_token_accuracy": 0.1632000833749771,
|
|
"num_tokens": 3907691.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"entropy": 5.843317651748658,
|
|
"epoch": 0.1642651296829971,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999474836222273,
|
|
"loss": 5.83,
|
|
"mean_token_accuracy": 0.1665841408073902,
|
|
"num_tokens": 3918794.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 5.834485340118408,
|
|
"epoch": 0.16474543707973102,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999467403291928,
|
|
"loss": 5.8301,
|
|
"mean_token_accuracy": 0.1692491739988327,
|
|
"num_tokens": 3929773.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"entropy": 5.874946594238281,
|
|
"epoch": 0.16522574447646493,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999459918135628,
|
|
"loss": 5.8498,
|
|
"mean_token_accuracy": 0.16062923073768615,
|
|
"num_tokens": 3940264.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 5.791439247131348,
|
|
"epoch": 0.16570605187319884,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499945238075355,
|
|
"loss": 5.7456,
|
|
"mean_token_accuracy": 0.1693306788802147,
|
|
"num_tokens": 3951500.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"entropy": 5.851829910278321,
|
|
"epoch": 0.16618635926993275,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999444791145865,
|
|
"loss": 5.8145,
|
|
"mean_token_accuracy": 0.16588351577520372,
|
|
"num_tokens": 3963580.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 5.804158353805542,
|
|
"epoch": 0.16666666666666666,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999437149312754,
|
|
"loss": 5.7585,
|
|
"mean_token_accuracy": 0.17176578491926192,
|
|
"num_tokens": 3975994.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"entropy": 5.836318635940552,
|
|
"epoch": 0.16714697406340057,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499942945525439,
|
|
"loss": 5.7658,
|
|
"mean_token_accuracy": 0.15896687656641006,
|
|
"num_tokens": 3987897.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 5.888211059570312,
|
|
"epoch": 0.16762728146013448,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999421708970954,
|
|
"loss": 5.93,
|
|
"mean_token_accuracy": 0.15537445321679116,
|
|
"num_tokens": 3999829.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"entropy": 5.7658594131469725,
|
|
"epoch": 0.16810758885686838,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999413910462625,
|
|
"loss": 5.7591,
|
|
"mean_token_accuracy": 0.16620118021965027,
|
|
"num_tokens": 4010882.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 5.861884737014771,
|
|
"epoch": 0.1685878962536023,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999406059729586,
|
|
"loss": 5.7469,
|
|
"mean_token_accuracy": 0.17034892737865448,
|
|
"num_tokens": 4021423.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"entropy": 5.888075494766236,
|
|
"epoch": 0.1690682036503362,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004999398156772016,
|
|
"loss": 5.8931,
|
|
"mean_token_accuracy": 0.15374189764261245,
|
|
"num_tokens": 4033590.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 5.721970653533935,
|
|
"epoch": 0.16954851104707014,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00049993902015901,
|
|
"loss": 5.7562,
|
|
"mean_token_accuracy": 0.16655992865562438,
|
|
"num_tokens": 4043978.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"entropy": 5.931190156936646,
|
|
"epoch": 0.17002881844380405,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999382194184023,
|
|
"loss": 5.8756,
|
|
"mean_token_accuracy": 0.16273052543401717,
|
|
"num_tokens": 4054513.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 5.857993745803833,
|
|
"epoch": 0.17050912584053796,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999374134553972,
|
|
"loss": 5.8367,
|
|
"mean_token_accuracy": 0.16276317089796066,
|
|
"num_tokens": 4066019.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"entropy": 5.841061735153199,
|
|
"epoch": 0.17098943323727187,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999366022700131,
|
|
"loss": 5.7935,
|
|
"mean_token_accuracy": 0.1673088401556015,
|
|
"num_tokens": 4077688.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 5.860415935516357,
|
|
"epoch": 0.17146974063400577,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999357858622691,
|
|
"loss": 5.8573,
|
|
"mean_token_accuracy": 0.1664716601371765,
|
|
"num_tokens": 4089803.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"entropy": 5.8289069652557375,
|
|
"epoch": 0.17195004803073968,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004999349642321842,
|
|
"loss": 5.8073,
|
|
"mean_token_accuracy": 0.16912547051906585,
|
|
"num_tokens": 4101969.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 5.799117517471314,
|
|
"epoch": 0.1724303554274736,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999341373797772,
|
|
"loss": 5.7955,
|
|
"mean_token_accuracy": 0.15957102179527283,
|
|
"num_tokens": 4113567.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"entropy": 5.814974451065064,
|
|
"epoch": 0.1729106628242075,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999333053050675,
|
|
"loss": 5.7575,
|
|
"mean_token_accuracy": 0.1691056177020073,
|
|
"num_tokens": 4125191.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 5.827954626083374,
|
|
"epoch": 0.1733909702209414,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999324680080744,
|
|
"loss": 5.8004,
|
|
"mean_token_accuracy": 0.16687883883714677,
|
|
"num_tokens": 4135050.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"entropy": 5.842863750457764,
|
|
"epoch": 0.17387127761767532,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999316254888172,
|
|
"loss": 5.8736,
|
|
"mean_token_accuracy": 0.1648238182067871,
|
|
"num_tokens": 4146874.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 5.857775688171387,
|
|
"epoch": 0.17435158501440923,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999307777473157,
|
|
"loss": 5.7974,
|
|
"mean_token_accuracy": 0.16151650995016098,
|
|
"num_tokens": 4158118.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"entropy": 5.818978691101075,
|
|
"epoch": 0.17483189241114314,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004999299247835893,
|
|
"loss": 5.7561,
|
|
"mean_token_accuracy": 0.17479462176561356,
|
|
"num_tokens": 4169035.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 5.738432455062866,
|
|
"epoch": 0.17531219980787704,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499929066597658,
|
|
"loss": 5.745,
|
|
"mean_token_accuracy": 0.17148349434137344,
|
|
"num_tokens": 4180314.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"entropy": 5.883955717086792,
|
|
"epoch": 0.17579250720461095,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999282031895418,
|
|
"loss": 5.8239,
|
|
"mean_token_accuracy": 0.16614590883255004,
|
|
"num_tokens": 4192238.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 5.769097232818604,
|
|
"epoch": 0.17627281460134486,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999273345592604,
|
|
"loss": 5.756,
|
|
"mean_token_accuracy": 0.16652164459228516,
|
|
"num_tokens": 4203346.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"entropy": 5.811061954498291,
|
|
"epoch": 0.17675312199807877,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999264607068343,
|
|
"loss": 5.8159,
|
|
"mean_token_accuracy": 0.17016567289829254,
|
|
"num_tokens": 4213763.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 5.781940555572509,
|
|
"epoch": 0.17723342939481268,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.0004999255816322837,
|
|
"loss": 5.7699,
|
|
"mean_token_accuracy": 0.16876950412988662,
|
|
"num_tokens": 4225553.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"entropy": 5.857665061950684,
|
|
"epoch": 0.1777137367915466,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000499924697335629,
|
|
"loss": 5.702,
|
|
"mean_token_accuracy": 0.17350574135780333,
|
|
"num_tokens": 4236058.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 5.640166330337524,
|
|
"epoch": 0.1781940441882805,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999238078168906,
|
|
"loss": 5.7763,
|
|
"mean_token_accuracy": 0.17054813206195832,
|
|
"num_tokens": 4248299.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"entropy": 5.8273721694946286,
|
|
"epoch": 0.1786743515850144,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999229130760894,
|
|
"loss": 5.7052,
|
|
"mean_token_accuracy": 0.17111807465553283,
|
|
"num_tokens": 4259704.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 5.691127586364746,
|
|
"epoch": 0.17915465898174832,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000499922013113246,
|
|
"loss": 5.587,
|
|
"mean_token_accuracy": 0.18398697525262833,
|
|
"num_tokens": 4270480.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"entropy": 5.780127954483032,
|
|
"epoch": 0.17963496637848222,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999211079283814,
|
|
"loss": 5.8538,
|
|
"mean_token_accuracy": 0.16719998568296432,
|
|
"num_tokens": 4282104.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 5.849603605270386,
|
|
"epoch": 0.18011527377521613,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999201975215164,
|
|
"loss": 5.8172,
|
|
"mean_token_accuracy": 0.16666848957538605,
|
|
"num_tokens": 4294251.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"entropy": 5.757232236862182,
|
|
"epoch": 0.18059558117195004,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999192818926725,
|
|
"loss": 5.7017,
|
|
"mean_token_accuracy": 0.16847867369651795,
|
|
"num_tokens": 4305569.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 5.859993028640747,
|
|
"epoch": 0.18107588856868395,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999183610418706,
|
|
"loss": 5.8283,
|
|
"mean_token_accuracy": 0.16413767859339715,
|
|
"num_tokens": 4317845.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"entropy": 5.76594557762146,
|
|
"epoch": 0.18155619596541786,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004999174349691322,
|
|
"loss": 5.6959,
|
|
"mean_token_accuracy": 0.17179392874240876,
|
|
"num_tokens": 4329987.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 5.697657203674316,
|
|
"epoch": 0.18203650336215177,
|
|
"grad_norm": 0.88671875,
|
|
"learning_rate": 0.0004999165036744788,
|
|
"loss": 5.7257,
|
|
"mean_token_accuracy": 0.16847490072250365,
|
|
"num_tokens": 4341628.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"entropy": 5.861244201660156,
|
|
"epoch": 0.18251681075888568,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999155671579322,
|
|
"loss": 5.7851,
|
|
"mean_token_accuracy": 0.1615397110581398,
|
|
"num_tokens": 4352379.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 5.6849024295806885,
|
|
"epoch": 0.1829971181556196,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499914625419514,
|
|
"loss": 5.7181,
|
|
"mean_token_accuracy": 0.171738800406456,
|
|
"num_tokens": 4364800.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"entropy": 5.776795959472656,
|
|
"epoch": 0.1834774255523535,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999136784592459,
|
|
"loss": 5.7315,
|
|
"mean_token_accuracy": 0.16872817426919937,
|
|
"num_tokens": 4376048.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 5.730347061157227,
|
|
"epoch": 0.1839577329490874,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004999127262771502,
|
|
"loss": 5.7297,
|
|
"mean_token_accuracy": 0.16825871765613556,
|
|
"num_tokens": 4388072.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"entropy": 5.872533082962036,
|
|
"epoch": 0.1844380403458213,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999117688732487,
|
|
"loss": 5.8226,
|
|
"mean_token_accuracy": 0.16391085535287858,
|
|
"num_tokens": 4399843.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 5.713910245895386,
|
|
"epoch": 0.18491834774255522,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999108062475638,
|
|
"loss": 5.6757,
|
|
"mean_token_accuracy": 0.17384760677814484,
|
|
"num_tokens": 4411373.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"entropy": 5.716005563735962,
|
|
"epoch": 0.18539865513928913,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499909838400118,
|
|
"loss": 5.6614,
|
|
"mean_token_accuracy": 0.173922398686409,
|
|
"num_tokens": 4421857.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 5.820113229751587,
|
|
"epoch": 0.18587896253602307,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999088653309334,
|
|
"loss": 5.7618,
|
|
"mean_token_accuracy": 0.1711716189980507,
|
|
"num_tokens": 4432728.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"entropy": 5.708466053009033,
|
|
"epoch": 0.18635926993275698,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999078870400329,
|
|
"loss": 5.693,
|
|
"mean_token_accuracy": 0.17283684760332108,
|
|
"num_tokens": 4444683.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 5.8614743709564205,
|
|
"epoch": 0.18683957732949089,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999069035274391,
|
|
"loss": 5.8215,
|
|
"mean_token_accuracy": 0.16018551886081694,
|
|
"num_tokens": 4456961.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"entropy": 5.694478511810303,
|
|
"epoch": 0.1873198847262248,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004999059147931747,
|
|
"loss": 5.665,
|
|
"mean_token_accuracy": 0.1762719616293907,
|
|
"num_tokens": 4468424.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 5.791493558883667,
|
|
"epoch": 0.1878001921229587,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999049208372629,
|
|
"loss": 5.8694,
|
|
"mean_token_accuracy": 0.15364666059613227,
|
|
"num_tokens": 4479813.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"entropy": 5.952554082870483,
|
|
"epoch": 0.1882804995196926,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999039216597267,
|
|
"loss": 5.862,
|
|
"mean_token_accuracy": 0.16733278185129166,
|
|
"num_tokens": 4491172.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 5.706536293029785,
|
|
"epoch": 0.18876080691642652,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999029172605892,
|
|
"loss": 5.7439,
|
|
"mean_token_accuracy": 0.1704375624656677,
|
|
"num_tokens": 4503063.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"entropy": 5.889812326431274,
|
|
"epoch": 0.18924111431316043,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004999019076398738,
|
|
"loss": 5.8177,
|
|
"mean_token_accuracy": 0.15313875377178193,
|
|
"num_tokens": 4514188.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 5.822384834289551,
|
|
"epoch": 0.18972142170989434,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000499900892797604,
|
|
"loss": 5.7258,
|
|
"mean_token_accuracy": 0.17310872822999954,
|
|
"num_tokens": 4525293.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"entropy": 5.80044903755188,
|
|
"epoch": 0.19020172910662825,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998998727338031,
|
|
"loss": 5.8139,
|
|
"mean_token_accuracy": 0.1692732721567154,
|
|
"num_tokens": 4536589.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 5.689789342880249,
|
|
"epoch": 0.19068203650336216,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998988474484952,
|
|
"loss": 5.5648,
|
|
"mean_token_accuracy": 0.19031796902418135,
|
|
"num_tokens": 4547594.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"entropy": 5.717133808135986,
|
|
"epoch": 0.19116234390009607,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004998978169417038,
|
|
"loss": 5.78,
|
|
"mean_token_accuracy": 0.1743384450674057,
|
|
"num_tokens": 4559850.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 5.791743421554566,
|
|
"epoch": 0.19164265129682997,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998967812134529,
|
|
"loss": 5.7138,
|
|
"mean_token_accuracy": 0.17110339552164078,
|
|
"num_tokens": 4570727.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"entropy": 5.610540056228638,
|
|
"epoch": 0.19212295869356388,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998957402637664,
|
|
"loss": 5.6542,
|
|
"mean_token_accuracy": 0.17157155871391297,
|
|
"num_tokens": 4582248.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 5.801579093933105,
|
|
"epoch": 0.1926032660902978,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998946940926687,
|
|
"loss": 5.6973,
|
|
"mean_token_accuracy": 0.17121600955724717,
|
|
"num_tokens": 4592604.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"entropy": 5.661766576766968,
|
|
"epoch": 0.1930835734870317,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499893642700184,
|
|
"loss": 5.7182,
|
|
"mean_token_accuracy": 0.17020188719034196,
|
|
"num_tokens": 4604398.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 5.790825366973877,
|
|
"epoch": 0.1935638808837656,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004998925860863368,
|
|
"loss": 5.7931,
|
|
"mean_token_accuracy": 0.1685462474822998,
|
|
"num_tokens": 4616434.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"entropy": 5.820285224914551,
|
|
"epoch": 0.19404418828049952,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004998915242511516,
|
|
"loss": 5.7541,
|
|
"mean_token_accuracy": 0.17625110745429992,
|
|
"num_tokens": 4627577.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 5.7781401634216305,
|
|
"epoch": 0.19452449567723343,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998904571946528,
|
|
"loss": 5.817,
|
|
"mean_token_accuracy": 0.16743545606732368,
|
|
"num_tokens": 4639698.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"entropy": 5.838766145706177,
|
|
"epoch": 0.19500480307396734,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998893849168655,
|
|
"loss": 5.8269,
|
|
"mean_token_accuracy": 0.16433341503143312,
|
|
"num_tokens": 4650643.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 5.762656116485596,
|
|
"epoch": 0.19548511047070125,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004998883074178144,
|
|
"loss": 5.7427,
|
|
"mean_token_accuracy": 0.16878412663936615,
|
|
"num_tokens": 4662897.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"entropy": 5.818380117416382,
|
|
"epoch": 0.19596541786743515,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998872246975247,
|
|
"loss": 5.8217,
|
|
"mean_token_accuracy": 0.1706990644335747,
|
|
"num_tokens": 4673701.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 5.910197305679321,
|
|
"epoch": 0.19644572526416906,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998861367560213,
|
|
"loss": 5.7826,
|
|
"mean_token_accuracy": 0.16689348816871644,
|
|
"num_tokens": 4685873.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"entropy": 5.714930677413941,
|
|
"epoch": 0.19692603266090297,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998850435933296,
|
|
"loss": 5.6724,
|
|
"mean_token_accuracy": 0.17364383190870286,
|
|
"num_tokens": 4697179.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 5.752671766281128,
|
|
"epoch": 0.19740634005763688,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998839452094749,
|
|
"loss": 5.7084,
|
|
"mean_token_accuracy": 0.17288116365671158,
|
|
"num_tokens": 4707752.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"entropy": 5.625265073776245,
|
|
"epoch": 0.1978866474543708,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998828416044829,
|
|
"loss": 5.58,
|
|
"mean_token_accuracy": 0.17766032367944717,
|
|
"num_tokens": 4718413.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 5.750666522979737,
|
|
"epoch": 0.1983669548511047,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499881732778379,
|
|
"loss": 5.7696,
|
|
"mean_token_accuracy": 0.16185117661952972,
|
|
"num_tokens": 4730033.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"entropy": 5.668474435806274,
|
|
"epoch": 0.1988472622478386,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.000499880618731189,
|
|
"loss": 5.6346,
|
|
"mean_token_accuracy": 0.17201206237077712,
|
|
"num_tokens": 4742084.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 5.801948118209839,
|
|
"epoch": 0.19932756964457252,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004998794994629388,
|
|
"loss": 5.8485,
|
|
"mean_token_accuracy": 0.16415513008832933,
|
|
"num_tokens": 4753885.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"entropy": 5.755141353607177,
|
|
"epoch": 0.19980787704130643,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998783749736545,
|
|
"loss": 5.6852,
|
|
"mean_token_accuracy": 0.17273288518190383,
|
|
"num_tokens": 4765686.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 5.7318039894104,
|
|
"epoch": 0.20028818443804033,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004998772452633619,
|
|
"loss": 5.7343,
|
|
"mean_token_accuracy": 0.1667577311396599,
|
|
"num_tokens": 4777157.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"entropy": 5.734004545211792,
|
|
"epoch": 0.20076849183477424,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998761103320876,
|
|
"loss": 5.6803,
|
|
"mean_token_accuracy": 0.17569620162248611,
|
|
"num_tokens": 4788583.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 5.81385350227356,
|
|
"epoch": 0.20124879923150815,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004998749701798577,
|
|
"loss": 5.795,
|
|
"mean_token_accuracy": 0.164644692838192,
|
|
"num_tokens": 4800749.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"entropy": 5.652225208282471,
|
|
"epoch": 0.2017291066282421,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004998738248066986,
|
|
"loss": 5.7001,
|
|
"mean_token_accuracy": 0.17118856757879258,
|
|
"num_tokens": 4812488.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 5.816308832168579,
|
|
"epoch": 0.202209414024976,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998726742126372,
|
|
"loss": 5.6902,
|
|
"mean_token_accuracy": 0.17228334546089172,
|
|
"num_tokens": 4823495.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"entropy": 5.622010517120361,
|
|
"epoch": 0.2026897214217099,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998715183976999,
|
|
"loss": 5.726,
|
|
"mean_token_accuracy": 0.16997579634189605,
|
|
"num_tokens": 4834450.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 5.763468551635742,
|
|
"epoch": 0.20317002881844382,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004998703573619137,
|
|
"loss": 5.6443,
|
|
"mean_token_accuracy": 0.18120874017477034,
|
|
"num_tokens": 4846826.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"entropy": 5.804740762710571,
|
|
"epoch": 0.20365033621517772,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004998691911053056,
|
|
"loss": 5.8366,
|
|
"mean_token_accuracy": 0.15913107842206956,
|
|
"num_tokens": 4859668.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 5.727064418792724,
|
|
"epoch": 0.20413064361191163,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998680196279026,
|
|
"loss": 5.7049,
|
|
"mean_token_accuracy": 0.17213667631149293,
|
|
"num_tokens": 4871727.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"entropy": 5.794467830657959,
|
|
"epoch": 0.20461095100864554,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998668429297319,
|
|
"loss": 5.7674,
|
|
"mean_token_accuracy": 0.17240212336182595,
|
|
"num_tokens": 4882191.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 5.760322952270508,
|
|
"epoch": 0.20509125840537945,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998656610108208,
|
|
"loss": 5.6971,
|
|
"mean_token_accuracy": 0.1685373991727829,
|
|
"num_tokens": 4892416.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"entropy": 5.694274854660034,
|
|
"epoch": 0.20557156580211336,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998644738711969,
|
|
"loss": 5.6674,
|
|
"mean_token_accuracy": 0.1685459852218628,
|
|
"num_tokens": 4903572.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 5.810105037689209,
|
|
"epoch": 0.20605187319884727,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 0.0004998632815108874,
|
|
"loss": 5.763,
|
|
"mean_token_accuracy": 0.16395961344242097,
|
|
"num_tokens": 4915417.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"entropy": 5.73304591178894,
|
|
"epoch": 0.20653218059558118,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998620839299203,
|
|
"loss": 5.6495,
|
|
"mean_token_accuracy": 0.17259960770606994,
|
|
"num_tokens": 4926943.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 5.6710865020751955,
|
|
"epoch": 0.2070124879923151,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998608811283233,
|
|
"loss": 5.6095,
|
|
"mean_token_accuracy": 0.17803010493516921,
|
|
"num_tokens": 4937724.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"entropy": 5.7808784484863285,
|
|
"epoch": 0.207492795389049,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998596731061244,
|
|
"loss": 5.7756,
|
|
"mean_token_accuracy": 0.16368448734283447,
|
|
"num_tokens": 4949970.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 5.784394645690918,
|
|
"epoch": 0.2079731027857829,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004998584598633516,
|
|
"loss": 5.774,
|
|
"mean_token_accuracy": 0.16977567672729493,
|
|
"num_tokens": 4961389.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"entropy": 5.7822630405426025,
|
|
"epoch": 0.2084534101825168,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998572414000329,
|
|
"loss": 5.82,
|
|
"mean_token_accuracy": 0.16696709543466567,
|
|
"num_tokens": 4973888.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 5.75656681060791,
|
|
"epoch": 0.20893371757925072,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998560177161969,
|
|
"loss": 5.7667,
|
|
"mean_token_accuracy": 0.1604086473584175,
|
|
"num_tokens": 4985423.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"entropy": 5.70469822883606,
|
|
"epoch": 0.20941402497598463,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004998547888118718,
|
|
"loss": 5.726,
|
|
"mean_token_accuracy": 0.16619897931814193,
|
|
"num_tokens": 4997711.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 5.7725687503814695,
|
|
"epoch": 0.20989433237271854,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998535546870862,
|
|
"loss": 5.7454,
|
|
"mean_token_accuracy": 0.1679087519645691,
|
|
"num_tokens": 5009633.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"entropy": 5.739374876022339,
|
|
"epoch": 0.21037463976945245,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998523153418687,
|
|
"loss": 5.6759,
|
|
"mean_token_accuracy": 0.17375072985887527,
|
|
"num_tokens": 5021523.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 5.785361337661743,
|
|
"epoch": 0.21085494716618636,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998510707762481,
|
|
"loss": 5.7695,
|
|
"mean_token_accuracy": 0.1699072614312172,
|
|
"num_tokens": 5033513.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"entropy": 5.7873194217681885,
|
|
"epoch": 0.21133525456292027,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004998498209902533,
|
|
"loss": 5.7758,
|
|
"mean_token_accuracy": 0.16922611892223358,
|
|
"num_tokens": 5047055.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 5.707646226882934,
|
|
"epoch": 0.21181556195965417,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998485659839134,
|
|
"loss": 5.6497,
|
|
"mean_token_accuracy": 0.17682456970214844,
|
|
"num_tokens": 5057613.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"entropy": 5.753945970535279,
|
|
"epoch": 0.21229586935638808,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998473057572575,
|
|
"loss": 5.7615,
|
|
"mean_token_accuracy": 0.16833806186914443,
|
|
"num_tokens": 5068886.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 5.742906093597412,
|
|
"epoch": 0.212776176753122,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998460403103146,
|
|
"loss": 5.7494,
|
|
"mean_token_accuracy": 0.16465574279427528,
|
|
"num_tokens": 5079978.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"entropy": 5.736083173751831,
|
|
"epoch": 0.2132564841498559,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998447696431146,
|
|
"loss": 5.7159,
|
|
"mean_token_accuracy": 0.17075446248054504,
|
|
"num_tokens": 5091021.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 5.6740076541900635,
|
|
"epoch": 0.2137367915465898,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998434937556865,
|
|
"loss": 5.5988,
|
|
"mean_token_accuracy": 0.181574647128582,
|
|
"num_tokens": 5101483.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"entropy": 5.708674907684326,
|
|
"epoch": 0.21421709894332372,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998422126480602,
|
|
"loss": 5.7447,
|
|
"mean_token_accuracy": 0.16306292563676833,
|
|
"num_tokens": 5113116.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 5.82704176902771,
|
|
"epoch": 0.21469740634005763,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998409263202653,
|
|
"loss": 5.6819,
|
|
"mean_token_accuracy": 0.1686948984861374,
|
|
"num_tokens": 5124824.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"entropy": 5.589908075332642,
|
|
"epoch": 0.21517771373679154,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998396347723318,
|
|
"loss": 5.6335,
|
|
"mean_token_accuracy": 0.16587817817926406,
|
|
"num_tokens": 5137567.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 5.72907018661499,
|
|
"epoch": 0.21565802113352545,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004998383380042895,
|
|
"loss": 5.6846,
|
|
"mean_token_accuracy": 0.16729460805654525,
|
|
"num_tokens": 5149016.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"entropy": 5.6214783668518065,
|
|
"epoch": 0.21613832853025935,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998370360161688,
|
|
"loss": 5.5788,
|
|
"mean_token_accuracy": 0.17212725132703782,
|
|
"num_tokens": 5160356.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 5.79612250328064,
|
|
"epoch": 0.21661863592699326,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998357288079996,
|
|
"loss": 5.7818,
|
|
"mean_token_accuracy": 0.16184753328561782,
|
|
"num_tokens": 5172100.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"entropy": 5.740008592605591,
|
|
"epoch": 0.21709894332372717,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998344163798125,
|
|
"loss": 5.7405,
|
|
"mean_token_accuracy": 0.16320510655641557,
|
|
"num_tokens": 5183984.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 5.707123565673828,
|
|
"epoch": 0.21757925072046108,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004998330987316379,
|
|
"loss": 5.7153,
|
|
"mean_token_accuracy": 0.167342671751976,
|
|
"num_tokens": 5195853.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"entropy": 5.6320737361907955,
|
|
"epoch": 0.21805955811719502,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998317758635062,
|
|
"loss": 5.5593,
|
|
"mean_token_accuracy": 0.17451774328947067,
|
|
"num_tokens": 5206995.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 5.515458297729492,
|
|
"epoch": 0.21853986551392893,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998304477754484,
|
|
"loss": 5.5989,
|
|
"mean_token_accuracy": 0.17679600268602372,
|
|
"num_tokens": 5219291.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"entropy": 5.740645408630371,
|
|
"epoch": 0.21902017291066284,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998291144674952,
|
|
"loss": 5.6885,
|
|
"mean_token_accuracy": 0.17223394364118577,
|
|
"num_tokens": 5230856.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 5.601490020751953,
|
|
"epoch": 0.21950048030739674,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004998277759396776,
|
|
"loss": 5.5333,
|
|
"mean_token_accuracy": 0.1814967930316925,
|
|
"num_tokens": 5242871.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"entropy": 5.656805944442749,
|
|
"epoch": 0.21998078770413065,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998264321920265,
|
|
"loss": 5.64,
|
|
"mean_token_accuracy": 0.17801354676485062,
|
|
"num_tokens": 5253835.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 5.676252794265747,
|
|
"epoch": 0.22046109510086456,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.0004998250832245734,
|
|
"loss": 5.6181,
|
|
"mean_token_accuracy": 0.17702293545007705,
|
|
"num_tokens": 5266195.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"entropy": 5.641697740554809,
|
|
"epoch": 0.22094140249759847,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998237290373494,
|
|
"loss": 5.6002,
|
|
"mean_token_accuracy": 0.1801271617412567,
|
|
"num_tokens": 5277499.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 5.739913368225098,
|
|
"epoch": 0.22142170989433238,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000499822369630386,
|
|
"loss": 5.7231,
|
|
"mean_token_accuracy": 0.1597047820687294,
|
|
"num_tokens": 5288622.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"entropy": 5.738846015930176,
|
|
"epoch": 0.2219020172910663,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998210050037148,
|
|
"loss": 5.7816,
|
|
"mean_token_accuracy": 0.16195343434810638,
|
|
"num_tokens": 5299664.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 5.717037725448608,
|
|
"epoch": 0.2223823246878002,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998196351573674,
|
|
"loss": 5.6552,
|
|
"mean_token_accuracy": 0.17402878403663635,
|
|
"num_tokens": 5311627.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"entropy": 5.5637411117553714,
|
|
"epoch": 0.2228626320845341,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004998182600913757,
|
|
"loss": 5.5627,
|
|
"mean_token_accuracy": 0.17947529554367064,
|
|
"num_tokens": 5323000.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 5.704880237579346,
|
|
"epoch": 0.22334293948126802,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998168798057715,
|
|
"loss": 5.5992,
|
|
"mean_token_accuracy": 0.18110302537679673,
|
|
"num_tokens": 5333811.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"entropy": 5.615099573135376,
|
|
"epoch": 0.22382324687800192,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499815494300587,
|
|
"loss": 5.5991,
|
|
"mean_token_accuracy": 0.17574110478162766,
|
|
"num_tokens": 5344762.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 5.721481513977051,
|
|
"epoch": 0.22430355427473583,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998141035758542,
|
|
"loss": 5.6195,
|
|
"mean_token_accuracy": 0.17343118488788606,
|
|
"num_tokens": 5356112.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"entropy": 5.655849504470825,
|
|
"epoch": 0.22478386167146974,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004998127076316054,
|
|
"loss": 5.7311,
|
|
"mean_token_accuracy": 0.17190437763929367,
|
|
"num_tokens": 5367339.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 5.674526071548462,
|
|
"epoch": 0.22526416906820365,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998113064678734,
|
|
"loss": 5.6665,
|
|
"mean_token_accuracy": 0.17564141601324082,
|
|
"num_tokens": 5378627.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"entropy": 5.726110649108887,
|
|
"epoch": 0.22574447646493756,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998099000846901,
|
|
"loss": 5.7012,
|
|
"mean_token_accuracy": 0.1681268870830536,
|
|
"num_tokens": 5390209.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 5.734390020370483,
|
|
"epoch": 0.22622478386167147,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004998084884820887,
|
|
"loss": 5.6833,
|
|
"mean_token_accuracy": 0.17136491537094117,
|
|
"num_tokens": 5401578.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"entropy": 5.615032052993774,
|
|
"epoch": 0.22670509125840538,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998070716601016,
|
|
"loss": 5.5881,
|
|
"mean_token_accuracy": 0.17977205514907837,
|
|
"num_tokens": 5413831.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 5.722073316574097,
|
|
"epoch": 0.2271853986551393,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998056496187618,
|
|
"loss": 5.6496,
|
|
"mean_token_accuracy": 0.1711253985762596,
|
|
"num_tokens": 5425430.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"entropy": 5.49839334487915,
|
|
"epoch": 0.2276657060518732,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998042223581025,
|
|
"loss": 5.4985,
|
|
"mean_token_accuracy": 0.1870403528213501,
|
|
"num_tokens": 5435353.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 5.7514622688293455,
|
|
"epoch": 0.2281460134486071,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998027898781565,
|
|
"loss": 5.6991,
|
|
"mean_token_accuracy": 0.17083023190498353,
|
|
"num_tokens": 5446925.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"entropy": 5.589994049072265,
|
|
"epoch": 0.228626320845341,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998013521789574,
|
|
"loss": 5.5899,
|
|
"mean_token_accuracy": 0.1772562175989151,
|
|
"num_tokens": 5456613.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 5.697564649581909,
|
|
"epoch": 0.22910662824207492,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997999092605384,
|
|
"loss": 5.6209,
|
|
"mean_token_accuracy": 0.17314212173223495,
|
|
"num_tokens": 5467790.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"entropy": 5.672542333602905,
|
|
"epoch": 0.22958693563880883,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000499798461122933,
|
|
"loss": 5.6065,
|
|
"mean_token_accuracy": 0.17598363608121873,
|
|
"num_tokens": 5479166.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 5.594286203384399,
|
|
"epoch": 0.23006724303554274,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004997970077661748,
|
|
"loss": 5.5932,
|
|
"mean_token_accuracy": 0.18340873271226882,
|
|
"num_tokens": 5490186.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"entropy": 5.690382814407348,
|
|
"epoch": 0.23054755043227665,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997955491902977,
|
|
"loss": 5.5575,
|
|
"mean_token_accuracy": 0.1718940794467926,
|
|
"num_tokens": 5500416.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 5.582558584213257,
|
|
"epoch": 0.23102785782901056,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997940853953354,
|
|
"loss": 5.6489,
|
|
"mean_token_accuracy": 0.17370383739471434,
|
|
"num_tokens": 5512189.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"entropy": 5.628128719329834,
|
|
"epoch": 0.23150816522574447,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.000499792616381322,
|
|
"loss": 5.5142,
|
|
"mean_token_accuracy": 0.1828036591410637,
|
|
"num_tokens": 5523631.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 5.609222555160523,
|
|
"epoch": 0.23198847262247838,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004997911421482914,
|
|
"loss": 5.5763,
|
|
"mean_token_accuracy": 0.1823565348982811,
|
|
"num_tokens": 5535637.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"entropy": 5.639013814926147,
|
|
"epoch": 0.23246878001921228,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499789662696278,
|
|
"loss": 5.5869,
|
|
"mean_token_accuracy": 0.18035637438297272,
|
|
"num_tokens": 5546470.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 5.694498586654663,
|
|
"epoch": 0.2329490874159462,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004997881780253162,
|
|
"loss": 5.7456,
|
|
"mean_token_accuracy": 0.1703657627105713,
|
|
"num_tokens": 5558633.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"entropy": 5.6558629989624025,
|
|
"epoch": 0.2334293948126801,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 0.0004997866881354403,
|
|
"loss": 5.6547,
|
|
"mean_token_accuracy": 0.17033104449510575,
|
|
"num_tokens": 5570427.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 5.6951744556427,
|
|
"epoch": 0.23390970220941404,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000499785193026685,
|
|
"loss": 5.6383,
|
|
"mean_token_accuracy": 0.17484120875597,
|
|
"num_tokens": 5580991.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"entropy": 5.701549911499024,
|
|
"epoch": 0.23439000960614795,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997836926990851,
|
|
"loss": 5.6816,
|
|
"mean_token_accuracy": 0.17114701271057128,
|
|
"num_tokens": 5592777.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 5.602617788314819,
|
|
"epoch": 0.23487031700288186,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997821871526752,
|
|
"loss": 5.5874,
|
|
"mean_token_accuracy": 0.17974285781383514,
|
|
"num_tokens": 5603326.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"entropy": 5.631419324874878,
|
|
"epoch": 0.23535062439961577,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997806763874905,
|
|
"loss": 5.5697,
|
|
"mean_token_accuracy": 0.1791187435388565,
|
|
"num_tokens": 5614504.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 5.617094326019287,
|
|
"epoch": 0.23583093179634967,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004997791604035659,
|
|
"loss": 5.6264,
|
|
"mean_token_accuracy": 0.17776354700326918,
|
|
"num_tokens": 5625150.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"entropy": 5.6507199764251705,
|
|
"epoch": 0.23631123919308358,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997776392009366,
|
|
"loss": 5.6458,
|
|
"mean_token_accuracy": 0.169050732254982,
|
|
"num_tokens": 5636815.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 5.706958866119384,
|
|
"epoch": 0.2367915465898175,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004997761127796381,
|
|
"loss": 5.6366,
|
|
"mean_token_accuracy": 0.17092559188604356,
|
|
"num_tokens": 5648272.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"entropy": 5.628375577926636,
|
|
"epoch": 0.2372718539865514,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997745811397056,
|
|
"loss": 5.5463,
|
|
"mean_token_accuracy": 0.17801680713891982,
|
|
"num_tokens": 5659227.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 5.6414820671081545,
|
|
"epoch": 0.2377521613832853,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997730442811748,
|
|
"loss": 5.6796,
|
|
"mean_token_accuracy": 0.17399391829967498,
|
|
"num_tokens": 5670411.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"entropy": 5.5770539283752445,
|
|
"epoch": 0.23823246878001922,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997715022040814,
|
|
"loss": 5.5182,
|
|
"mean_token_accuracy": 0.1782184734940529,
|
|
"num_tokens": 5681570.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 5.523485231399536,
|
|
"epoch": 0.23871277617675313,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000499769954908461,
|
|
"loss": 5.5022,
|
|
"mean_token_accuracy": 0.1887900114059448,
|
|
"num_tokens": 5693021.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"entropy": 5.659896421432495,
|
|
"epoch": 0.23919308357348704,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004997684023943498,
|
|
"loss": 5.5883,
|
|
"mean_token_accuracy": 0.17428779155015944,
|
|
"num_tokens": 5704043.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 5.5805792808532715,
|
|
"epoch": 0.23967339097022095,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004997668446617837,
|
|
"loss": 5.6675,
|
|
"mean_token_accuracy": 0.16685750484466552,
|
|
"num_tokens": 5715735.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"entropy": 5.760880804061889,
|
|
"epoch": 0.24015369836695485,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997652817107989,
|
|
"loss": 5.6294,
|
|
"mean_token_accuracy": 0.17232899218797684,
|
|
"num_tokens": 5725778.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 5.601306343078614,
|
|
"epoch": 0.24063400576368876,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997637135414315,
|
|
"loss": 5.6628,
|
|
"mean_token_accuracy": 0.17220552116632462,
|
|
"num_tokens": 5737224.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"entropy": 5.779234981536865,
|
|
"epoch": 0.24111431316042267,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004997621401537183,
|
|
"loss": 5.6855,
|
|
"mean_token_accuracy": 0.17120948135852815,
|
|
"num_tokens": 5749226.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 5.6741156578063965,
|
|
"epoch": 0.24159462055715658,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997605615476955,
|
|
"loss": 5.6578,
|
|
"mean_token_accuracy": 0.17114464193582535,
|
|
"num_tokens": 5760282.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"entropy": 5.539696168899536,
|
|
"epoch": 0.2420749279538905,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004997589777234,
|
|
"loss": 5.5633,
|
|
"mean_token_accuracy": 0.181555312871933,
|
|
"num_tokens": 5771756.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 5.650804233551026,
|
|
"epoch": 0.2425552353506244,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997573886808684,
|
|
"loss": 5.5835,
|
|
"mean_token_accuracy": 0.16679947078227997,
|
|
"num_tokens": 5783237.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"entropy": 5.646309852600098,
|
|
"epoch": 0.2430355427473583,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997557944201375,
|
|
"loss": 5.6814,
|
|
"mean_token_accuracy": 0.17147036045789718,
|
|
"num_tokens": 5794825.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 5.675209999084473,
|
|
"epoch": 0.24351585014409222,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997541949412445,
|
|
"loss": 5.5712,
|
|
"mean_token_accuracy": 0.18625136017799376,
|
|
"num_tokens": 5805578.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"entropy": 5.649836206436158,
|
|
"epoch": 0.24399615754082613,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004997525902442266,
|
|
"loss": 5.6738,
|
|
"mean_token_accuracy": 0.16476511359214782,
|
|
"num_tokens": 5818201.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 5.602812147140503,
|
|
"epoch": 0.24447646493756003,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004997509803291207,
|
|
"loss": 5.5959,
|
|
"mean_token_accuracy": 0.17587143927812576,
|
|
"num_tokens": 5830319.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"entropy": 5.5824614524841305,
|
|
"epoch": 0.24495677233429394,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997493651959647,
|
|
"loss": 5.5428,
|
|
"mean_token_accuracy": 0.17996817231178283,
|
|
"num_tokens": 5840638.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 5.66239709854126,
|
|
"epoch": 0.24543707973102785,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004997477448447955,
|
|
"loss": 5.5773,
|
|
"mean_token_accuracy": 0.17367178648710252,
|
|
"num_tokens": 5852472.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"entropy": 5.678495073318482,
|
|
"epoch": 0.24591738712776176,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997461192756512,
|
|
"loss": 5.6133,
|
|
"mean_token_accuracy": 0.170744089782238,
|
|
"num_tokens": 5863455.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 5.512450170516968,
|
|
"epoch": 0.24639769452449567,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997444884885694,
|
|
"loss": 5.5251,
|
|
"mean_token_accuracy": 0.17817995101213455,
|
|
"num_tokens": 5873141.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"entropy": 5.603986024856567,
|
|
"epoch": 0.24687800192122958,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997428524835879,
|
|
"loss": 5.6316,
|
|
"mean_token_accuracy": 0.17475323528051376,
|
|
"num_tokens": 5884363.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 5.740997219085694,
|
|
"epoch": 0.2473583093179635,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004997412112607446,
|
|
"loss": 5.6721,
|
|
"mean_token_accuracy": 0.17148932665586472,
|
|
"num_tokens": 5895856.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"entropy": 5.542859792709351,
|
|
"epoch": 0.2478386167146974,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997395648200778,
|
|
"loss": 5.4922,
|
|
"mean_token_accuracy": 0.17950474172830583,
|
|
"num_tokens": 5906657.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 5.600370979309082,
|
|
"epoch": 0.2483189241114313,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.0004997379131616257,
|
|
"loss": 5.6226,
|
|
"mean_token_accuracy": 0.1700095072388649,
|
|
"num_tokens": 5919496.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"entropy": 5.690901279449463,
|
|
"epoch": 0.24879923150816521,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004997362562854266,
|
|
"loss": 5.6843,
|
|
"mean_token_accuracy": 0.16776154488325118,
|
|
"num_tokens": 5932593.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 5.619813919067383,
|
|
"epoch": 0.24927953890489912,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997345941915187,
|
|
"loss": 5.6128,
|
|
"mean_token_accuracy": 0.17226099967956543,
|
|
"num_tokens": 5944080.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"entropy": 5.602241802215576,
|
|
"epoch": 0.24975984630163303,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997329268799412,
|
|
"loss": 5.5752,
|
|
"mean_token_accuracy": 0.18460023701190947,
|
|
"num_tokens": 5955703.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 5.62792739868164,
|
|
"epoch": 0.25024015369836694,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004997312543507322,
|
|
"loss": 5.6565,
|
|
"mean_token_accuracy": 0.1714890867471695,
|
|
"num_tokens": 5966979.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"entropy": 5.672908306121826,
|
|
"epoch": 0.2507204610951009,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997295766039309,
|
|
"loss": 5.545,
|
|
"mean_token_accuracy": 0.17637500017881394,
|
|
"num_tokens": 5978808.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 5.6401097774505615,
|
|
"epoch": 0.25120076849183476,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004997278936395761,
|
|
"loss": 5.7288,
|
|
"mean_token_accuracy": 0.16584430038928985,
|
|
"num_tokens": 5992145.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"entropy": 5.665263652801514,
|
|
"epoch": 0.2516810758885687,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004997262054577071,
|
|
"loss": 5.5694,
|
|
"mean_token_accuracy": 0.17564088106155396,
|
|
"num_tokens": 6003723.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 5.6567973613739015,
|
|
"epoch": 0.2521613832853026,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004997245120583627,
|
|
"loss": 5.6351,
|
|
"mean_token_accuracy": 0.1769047811627388,
|
|
"num_tokens": 6014064.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"entropy": 5.53907151222229,
|
|
"epoch": 0.2526416906820365,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004997228134415825,
|
|
"loss": 5.5168,
|
|
"mean_token_accuracy": 0.1834915667772293,
|
|
"num_tokens": 6025455.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 5.6452476501464846,
|
|
"epoch": 0.2531219980787704,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997211096074059,
|
|
"loss": 5.6231,
|
|
"mean_token_accuracy": 0.16973316073417663,
|
|
"num_tokens": 6037347.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"entropy": 5.600665187835693,
|
|
"epoch": 0.25360230547550433,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997194005558722,
|
|
"loss": 5.5304,
|
|
"mean_token_accuracy": 0.18019532412290573,
|
|
"num_tokens": 6049236.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 5.534391641616821,
|
|
"epoch": 0.2540826128722382,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004997176862870216,
|
|
"loss": 5.5339,
|
|
"mean_token_accuracy": 0.1798613414168358,
|
|
"num_tokens": 6060982.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"entropy": 5.637931680679321,
|
|
"epoch": 0.25456292026897215,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004997159668008933,
|
|
"loss": 5.5514,
|
|
"mean_token_accuracy": 0.17985030263662338,
|
|
"num_tokens": 6070925.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 5.526381587982177,
|
|
"epoch": 0.25504322766570603,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997142420975277,
|
|
"loss": 5.514,
|
|
"mean_token_accuracy": 0.18175738006830217,
|
|
"num_tokens": 6081279.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"entropy": 5.5633796691894535,
|
|
"epoch": 0.25552353506243997,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004997125121769647,
|
|
"loss": 5.6108,
|
|
"mean_token_accuracy": 0.17793446481227876,
|
|
"num_tokens": 6091797.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 5.687921333312988,
|
|
"epoch": 0.25600384245917385,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004997107770392444,
|
|
"loss": 5.6134,
|
|
"mean_token_accuracy": 0.1804993599653244,
|
|
"num_tokens": 6103435.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"entropy": 5.648722791671753,
|
|
"epoch": 0.2564841498559078,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.000499709036684407,
|
|
"loss": 5.6751,
|
|
"mean_token_accuracy": 0.17587384432554246,
|
|
"num_tokens": 6114531.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 5.569314622879029,
|
|
"epoch": 0.25696445725264166,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997072911124932,
|
|
"loss": 5.5173,
|
|
"mean_token_accuracy": 0.17945850938558577,
|
|
"num_tokens": 6126110.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"entropy": 5.670061159133911,
|
|
"epoch": 0.2574447646493756,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997055403235432,
|
|
"loss": 5.6187,
|
|
"mean_token_accuracy": 0.1766670301556587,
|
|
"num_tokens": 6137114.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 5.62683253288269,
|
|
"epoch": 0.2579250720461095,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004997037843175978,
|
|
"loss": 5.5718,
|
|
"mean_token_accuracy": 0.17658228576183319,
|
|
"num_tokens": 6148696.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"entropy": 5.59165620803833,
|
|
"epoch": 0.2584053794428434,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004997020230946978,
|
|
"loss": 5.568,
|
|
"mean_token_accuracy": 0.1790614068508148,
|
|
"num_tokens": 6160235.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 5.629477691650391,
|
|
"epoch": 0.25888568683957736,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004997002566548841,
|
|
"loss": 5.5586,
|
|
"mean_token_accuracy": 0.17292713820934297,
|
|
"num_tokens": 6172031.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"entropy": 5.48054838180542,
|
|
"epoch": 0.25936599423631124,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004996984849981976,
|
|
"loss": 5.4233,
|
|
"mean_token_accuracy": 0.1893267199397087,
|
|
"num_tokens": 6183547.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 5.619540548324585,
|
|
"epoch": 0.2598463016330452,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004996967081246794,
|
|
"loss": 5.632,
|
|
"mean_token_accuracy": 0.1678134724497795,
|
|
"num_tokens": 6194768.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"entropy": 5.6499683380126955,
|
|
"epoch": 0.26032660902977905,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004996949260343711,
|
|
"loss": 5.6314,
|
|
"mean_token_accuracy": 0.1706198126077652,
|
|
"num_tokens": 6206099.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 5.624089670181275,
|
|
"epoch": 0.260806916426513,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004996931387273137,
|
|
"loss": 5.6262,
|
|
"mean_token_accuracy": 0.17660144418478013,
|
|
"num_tokens": 6217530.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"entropy": 5.713815212249756,
|
|
"epoch": 0.2612872238232469,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004996913462035487,
|
|
"loss": 5.6448,
|
|
"mean_token_accuracy": 0.1767139658331871,
|
|
"num_tokens": 6228564.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 5.539792156219482,
|
|
"epoch": 0.2617675312199808,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499689548463118,
|
|
"loss": 5.5174,
|
|
"mean_token_accuracy": 0.17854675203561782,
|
|
"num_tokens": 6239945.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"entropy": 5.59919810295105,
|
|
"epoch": 0.2622478386167147,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004996877455060631,
|
|
"loss": 5.6312,
|
|
"mean_token_accuracy": 0.17017472237348558,
|
|
"num_tokens": 6251829.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 5.7330786228179935,
|
|
"epoch": 0.2627281460134486,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004996859373324259,
|
|
"loss": 5.7264,
|
|
"mean_token_accuracy": 0.16224824339151384,
|
|
"num_tokens": 6264823.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"entropy": 5.5701476573944095,
|
|
"epoch": 0.2632084534101825,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996841239422485,
|
|
"loss": 5.4065,
|
|
"mean_token_accuracy": 0.18482713848352433,
|
|
"num_tokens": 6276247.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 5.470470857620239,
|
|
"epoch": 0.26368876080691644,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004996823053355729,
|
|
"loss": 5.5321,
|
|
"mean_token_accuracy": 0.18076382875442504,
|
|
"num_tokens": 6287593.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"entropy": 5.685536909103393,
|
|
"epoch": 0.2641690682036503,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004996804815124413,
|
|
"loss": 5.6897,
|
|
"mean_token_accuracy": 0.16898608654737474,
|
|
"num_tokens": 6299918.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 5.568260049819946,
|
|
"epoch": 0.26464937560038426,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996786524728962,
|
|
"loss": 5.5287,
|
|
"mean_token_accuracy": 0.18196363002061844,
|
|
"num_tokens": 6311147.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"entropy": 5.45229320526123,
|
|
"epoch": 0.26512968299711814,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004996768182169797,
|
|
"loss": 5.4564,
|
|
"mean_token_accuracy": 0.18652137070894242,
|
|
"num_tokens": 6323239.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 5.692247343063355,
|
|
"epoch": 0.2656099903938521,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996749787447349,
|
|
"loss": 5.5567,
|
|
"mean_token_accuracy": 0.17187336832284927,
|
|
"num_tokens": 6334625.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"entropy": 5.545494651794433,
|
|
"epoch": 0.26609029779058596,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499673134056204,
|
|
"loss": 5.5938,
|
|
"mean_token_accuracy": 0.17517421692609786,
|
|
"num_tokens": 6346068.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 5.584152412414551,
|
|
"epoch": 0.2665706051873199,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004996712841514303,
|
|
"loss": 5.5716,
|
|
"mean_token_accuracy": 0.17334717959165574,
|
|
"num_tokens": 6357097.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"entropy": 5.656313180923462,
|
|
"epoch": 0.2670509125840538,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996694290304563,
|
|
"loss": 5.6313,
|
|
"mean_token_accuracy": 0.16709280461072923,
|
|
"num_tokens": 6367481.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 5.52793607711792,
|
|
"epoch": 0.2675312199807877,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004996675686933255,
|
|
"loss": 5.5381,
|
|
"mean_token_accuracy": 0.18144787847995758,
|
|
"num_tokens": 6378873.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"entropy": 5.664049291610718,
|
|
"epoch": 0.2680115273775216,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004996657031400807,
|
|
"loss": 5.5768,
|
|
"mean_token_accuracy": 0.18006865531206132,
|
|
"num_tokens": 6390651.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 5.478256464004517,
|
|
"epoch": 0.26849183477425553,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996638323707655,
|
|
"loss": 5.446,
|
|
"mean_token_accuracy": 0.1820421040058136,
|
|
"num_tokens": 6401631.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"entropy": 5.48651123046875,
|
|
"epoch": 0.2689721421709894,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004996619563854232,
|
|
"loss": 5.5308,
|
|
"mean_token_accuracy": 0.1832943469285965,
|
|
"num_tokens": 6413875.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 5.689049482345581,
|
|
"epoch": 0.26945244956772335,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996600751840974,
|
|
"loss": 5.5579,
|
|
"mean_token_accuracy": 0.1733505442738533,
|
|
"num_tokens": 6425764.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"entropy": 5.478516244888306,
|
|
"epoch": 0.26993275696445723,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004996581887668317,
|
|
"loss": 5.494,
|
|
"mean_token_accuracy": 0.18221275955438615,
|
|
"num_tokens": 6437911.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 5.534301519393921,
|
|
"epoch": 0.27041306436119117,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00049965629713367,
|
|
"loss": 5.4961,
|
|
"mean_token_accuracy": 0.18141991049051284,
|
|
"num_tokens": 6449942.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"entropy": 5.604593276977539,
|
|
"epoch": 0.27089337175792505,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004996544002846561,
|
|
"loss": 5.6208,
|
|
"mean_token_accuracy": 0.17682201713323592,
|
|
"num_tokens": 6461729.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 5.614752101898193,
|
|
"epoch": 0.271373679154659,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996524982198343,
|
|
"loss": 5.5988,
|
|
"mean_token_accuracy": 0.17795798033475876,
|
|
"num_tokens": 6472046.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"entropy": 5.600375080108643,
|
|
"epoch": 0.27185398655139287,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996505909392485,
|
|
"loss": 5.5667,
|
|
"mean_token_accuracy": 0.17373612523078918,
|
|
"num_tokens": 6483308.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 5.429362010955811,
|
|
"epoch": 0.2723342939481268,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004996486784429429,
|
|
"loss": 5.4311,
|
|
"mean_token_accuracy": 0.18428465574979783,
|
|
"num_tokens": 6495093.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"entropy": 5.5981306552886965,
|
|
"epoch": 0.2728146013448607,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996467607309622,
|
|
"loss": 5.5307,
|
|
"mean_token_accuracy": 0.17854470163583755,
|
|
"num_tokens": 6505933.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 5.626583003997803,
|
|
"epoch": 0.2732949087415946,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996448378033507,
|
|
"loss": 5.5893,
|
|
"mean_token_accuracy": 0.17490534335374833,
|
|
"num_tokens": 6517280.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"entropy": 5.60156021118164,
|
|
"epoch": 0.2737752161383285,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996429096601532,
|
|
"loss": 5.6315,
|
|
"mean_token_accuracy": 0.17191672027111055,
|
|
"num_tokens": 6528980.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 5.601687097549439,
|
|
"epoch": 0.27425552353506244,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004996409763014144,
|
|
"loss": 5.6235,
|
|
"mean_token_accuracy": 0.17743158787488938,
|
|
"num_tokens": 6540670.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"entropy": 5.593181991577149,
|
|
"epoch": 0.2747358309317964,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996390377271791,
|
|
"loss": 5.5855,
|
|
"mean_token_accuracy": 0.18115401417016982,
|
|
"num_tokens": 6551302.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 5.5507872104644775,
|
|
"epoch": 0.27521613832853026,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996370939374924,
|
|
"loss": 5.5433,
|
|
"mean_token_accuracy": 0.1738438919186592,
|
|
"num_tokens": 6563177.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"entropy": 5.72943229675293,
|
|
"epoch": 0.2756964457252642,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004996351449323994,
|
|
"loss": 5.6521,
|
|
"mean_token_accuracy": 0.17468605786561966,
|
|
"num_tokens": 6573323.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 5.5880653858184814,
|
|
"epoch": 0.2761767531219981,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996331907119455,
|
|
"loss": 5.591,
|
|
"mean_token_accuracy": 0.16756793707609177,
|
|
"num_tokens": 6585382.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"entropy": 5.474012231826782,
|
|
"epoch": 0.276657060518732,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996312312761758,
|
|
"loss": 5.467,
|
|
"mean_token_accuracy": 0.1900227263569832,
|
|
"num_tokens": 6596629.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 5.6394744396209715,
|
|
"epoch": 0.2771373679154659,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499629266625136,
|
|
"loss": 5.5734,
|
|
"mean_token_accuracy": 0.17828488498926162,
|
|
"num_tokens": 6608408.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"entropy": 5.638094282150268,
|
|
"epoch": 0.27761767531219983,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996272967588715,
|
|
"loss": 5.5989,
|
|
"mean_token_accuracy": 0.1704651966691017,
|
|
"num_tokens": 6619375.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 5.618940448760986,
|
|
"epoch": 0.2780979827089337,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996253216774283,
|
|
"loss": 5.6398,
|
|
"mean_token_accuracy": 0.17304042726755142,
|
|
"num_tokens": 6631317.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"entropy": 5.576578378677368,
|
|
"epoch": 0.27857829010566765,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996233413808521,
|
|
"loss": 5.4904,
|
|
"mean_token_accuracy": 0.18116467744112014,
|
|
"num_tokens": 6642009.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 5.609902429580688,
|
|
"epoch": 0.27905859750240153,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996213558691889,
|
|
"loss": 5.6478,
|
|
"mean_token_accuracy": 0.1682332620024681,
|
|
"num_tokens": 6654713.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"entropy": 5.651772451400757,
|
|
"epoch": 0.27953890489913547,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004996193651424848,
|
|
"loss": 5.6064,
|
|
"mean_token_accuracy": 0.17700932323932647,
|
|
"num_tokens": 6667157.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 5.575735330581665,
|
|
"epoch": 0.28001921229586935,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.000499617369200786,
|
|
"loss": 5.5599,
|
|
"mean_token_accuracy": 0.18871267586946489,
|
|
"num_tokens": 6679573.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"entropy": 5.593114852905273,
|
|
"epoch": 0.2804995196926033,
|
|
"grad_norm": 0.859375,
|
|
"learning_rate": 0.0004996153680441389,
|
|
"loss": 5.624,
|
|
"mean_token_accuracy": 0.17413021624088287,
|
|
"num_tokens": 6691768.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 5.653490257263184,
|
|
"epoch": 0.28097982708933716,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00049961336167259,
|
|
"loss": 5.5864,
|
|
"mean_token_accuracy": 0.17438612282276153,
|
|
"num_tokens": 6701964.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"entropy": 5.618965578079224,
|
|
"epoch": 0.2814601344860711,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996113500861857,
|
|
"loss": 5.5759,
|
|
"mean_token_accuracy": 0.1726679503917694,
|
|
"num_tokens": 6713506.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 5.581022930145264,
|
|
"epoch": 0.281940441882805,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996093332849729,
|
|
"loss": 5.593,
|
|
"mean_token_accuracy": 0.1725487932562828,
|
|
"num_tokens": 6724616.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"entropy": 5.562248182296753,
|
|
"epoch": 0.2824207492795389,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996073112689983,
|
|
"loss": 5.5803,
|
|
"mean_token_accuracy": 0.17757243812084197,
|
|
"num_tokens": 6735054.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 5.616918420791626,
|
|
"epoch": 0.2829010566762728,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004996052840383088,
|
|
"loss": 5.6325,
|
|
"mean_token_accuracy": 0.17381539791822434,
|
|
"num_tokens": 6746756.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"entropy": 5.603857469558716,
|
|
"epoch": 0.28338136407300674,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 0.0004996032515929516,
|
|
"loss": 5.4992,
|
|
"mean_token_accuracy": 0.1776091992855072,
|
|
"num_tokens": 6759566.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 5.573670148849487,
|
|
"epoch": 0.2838616714697406,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004996012139329738,
|
|
"loss": 5.5225,
|
|
"mean_token_accuracy": 0.17899418324232103,
|
|
"num_tokens": 6771375.0,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"entropy": 5.619125080108643,
|
|
"epoch": 0.28434197886647455,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995991710584228,
|
|
"loss": 5.6311,
|
|
"mean_token_accuracy": 0.16734524071216583,
|
|
"num_tokens": 6783252.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 5.58878116607666,
|
|
"epoch": 0.28482228626320844,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004995971229693459,
|
|
"loss": 5.5941,
|
|
"mean_token_accuracy": 0.17340553402900696,
|
|
"num_tokens": 6795525.0,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"entropy": 5.610876131057739,
|
|
"epoch": 0.28530259365994237,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004995950696657909,
|
|
"loss": 5.5353,
|
|
"mean_token_accuracy": 0.17990380227565766,
|
|
"num_tokens": 6807212.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 5.52398419380188,
|
|
"epoch": 0.28578290105667625,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995930111478051,
|
|
"loss": 5.4712,
|
|
"mean_token_accuracy": 0.1771505206823349,
|
|
"num_tokens": 6819367.0,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"entropy": 5.5713125705719,
|
|
"epoch": 0.2862632084534102,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995909474154365,
|
|
"loss": 5.5531,
|
|
"mean_token_accuracy": 0.17791730761528016,
|
|
"num_tokens": 6830405.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 5.524326038360596,
|
|
"epoch": 0.28674351585014407,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004995888784687331,
|
|
"loss": 5.5413,
|
|
"mean_token_accuracy": 0.18089909702539445,
|
|
"num_tokens": 6841479.0,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"entropy": 5.545838022232056,
|
|
"epoch": 0.287223823246878,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995868043077428,
|
|
"loss": 5.5784,
|
|
"mean_token_accuracy": 0.1739095240831375,
|
|
"num_tokens": 6851585.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 5.605233526229858,
|
|
"epoch": 0.2877041306436119,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004995847249325137,
|
|
"loss": 5.5488,
|
|
"mean_token_accuracy": 0.1776391088962555,
|
|
"num_tokens": 6863176.0,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"entropy": 5.596064901351928,
|
|
"epoch": 0.2881844380403458,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995826403430942,
|
|
"loss": 5.595,
|
|
"mean_token_accuracy": 0.17474860548973084,
|
|
"num_tokens": 6874021.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.2881844380403458,
|
|
"eval_entropy": 5.440896103871502,
|
|
"eval_loss": 5.576871395111084,
|
|
"eval_mean_token_accuracy": 0.18414354559419172,
|
|
"eval_num_tokens": 6874021.0,
|
|
"eval_runtime": 26.9459,
|
|
"eval_samples_per_second": 1217.809,
|
|
"eval_steps_per_second": 152.231,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 5.6302040100097654,
|
|
"epoch": 0.2886647454370797,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004995805505395328,
|
|
"loss": 5.5584,
|
|
"mean_token_accuracy": 0.17477040886878967,
|
|
"num_tokens": 6884999.0,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"entropy": 5.559301853179932,
|
|
"epoch": 0.28914505283381364,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995784555218778,
|
|
"loss": 5.548,
|
|
"mean_token_accuracy": 0.17850742042064666,
|
|
"num_tokens": 6897021.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 5.518660974502564,
|
|
"epoch": 0.2896253602305475,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995763552901779,
|
|
"loss": 5.5449,
|
|
"mean_token_accuracy": 0.17909058481454848,
|
|
"num_tokens": 6908320.0,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"entropy": 5.68627028465271,
|
|
"epoch": 0.29010566762728146,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995742498444818,
|
|
"loss": 5.5342,
|
|
"mean_token_accuracy": 0.18174685835838317,
|
|
"num_tokens": 6919957.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 5.529996299743653,
|
|
"epoch": 0.2905859750240154,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004995721391848387,
|
|
"loss": 5.4942,
|
|
"mean_token_accuracy": 0.17575003057718278,
|
|
"num_tokens": 6930531.0,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"entropy": 5.623160696029663,
|
|
"epoch": 0.2910662824207493,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995700233112972,
|
|
"loss": 5.6325,
|
|
"mean_token_accuracy": 0.17704310566186904,
|
|
"num_tokens": 6942556.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 5.583187103271484,
|
|
"epoch": 0.2915465898174832,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004995679022239066,
|
|
"loss": 5.5762,
|
|
"mean_token_accuracy": 0.17900587618350983,
|
|
"num_tokens": 6954410.0,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"entropy": 5.579293632507325,
|
|
"epoch": 0.2920268972142171,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004995657759227162,
|
|
"loss": 5.5857,
|
|
"mean_token_accuracy": 0.17669540643692017,
|
|
"num_tokens": 6964970.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 5.554018545150757,
|
|
"epoch": 0.29250720461095103,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004995636444077751,
|
|
"loss": 5.4673,
|
|
"mean_token_accuracy": 0.1851392537355423,
|
|
"num_tokens": 6976016.0,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"entropy": 5.490430164337158,
|
|
"epoch": 0.2929875120076849,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995615076791333,
|
|
"loss": 5.4999,
|
|
"mean_token_accuracy": 0.1816742718219757,
|
|
"num_tokens": 6987199.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 5.5644313335418705,
|
|
"epoch": 0.29346781940441885,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995593657368399,
|
|
"loss": 5.5218,
|
|
"mean_token_accuracy": 0.18650518208742142,
|
|
"num_tokens": 6999174.0,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"entropy": 5.557963037490845,
|
|
"epoch": 0.29394812680115273,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499557218580945,
|
|
"loss": 5.5884,
|
|
"mean_token_accuracy": 0.17525261044502258,
|
|
"num_tokens": 7012148.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 5.486077213287354,
|
|
"epoch": 0.29442843419788667,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995550662114981,
|
|
"loss": 5.4609,
|
|
"mean_token_accuracy": 0.18215615749359132,
|
|
"num_tokens": 7023238.0,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"entropy": 5.561151647567749,
|
|
"epoch": 0.29490874159462055,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995529086285495,
|
|
"loss": 5.5521,
|
|
"mean_token_accuracy": 0.17758539766073228,
|
|
"num_tokens": 7034944.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 5.563313627243042,
|
|
"epoch": 0.2953890489913545,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499550745832149,
|
|
"loss": 5.4154,
|
|
"mean_token_accuracy": 0.18512072116136552,
|
|
"num_tokens": 7046880.0,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"entropy": 5.486554431915283,
|
|
"epoch": 0.29586935638808837,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004995485778223471,
|
|
"loss": 5.4866,
|
|
"mean_token_accuracy": 0.1800946146249771,
|
|
"num_tokens": 7057678.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 5.4739940643310545,
|
|
"epoch": 0.2963496637848223,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995464045991939,
|
|
"loss": 5.4688,
|
|
"mean_token_accuracy": 0.18641662895679473,
|
|
"num_tokens": 7068336.0,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"entropy": 5.588371753692627,
|
|
"epoch": 0.2968299711815562,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00049954422616274,
|
|
"loss": 5.5343,
|
|
"mean_token_accuracy": 0.17594826519489287,
|
|
"num_tokens": 7080341.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 5.6965454578399655,
|
|
"epoch": 0.2973102785782901,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995420425130359,
|
|
"loss": 5.6866,
|
|
"mean_token_accuracy": 0.17018966376781464,
|
|
"num_tokens": 7090618.0,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"entropy": 5.499913692474365,
|
|
"epoch": 0.297790585975024,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995398536501324,
|
|
"loss": 5.4331,
|
|
"mean_token_accuracy": 0.18785624653100969,
|
|
"num_tokens": 7101843.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 5.4791899681091305,
|
|
"epoch": 0.29827089337175794,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004995376595740801,
|
|
"loss": 5.5056,
|
|
"mean_token_accuracy": 0.18063082695007324,
|
|
"num_tokens": 7112014.0,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"entropy": 5.632973289489746,
|
|
"epoch": 0.2987512007684918,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995354602849302,
|
|
"loss": 5.5822,
|
|
"mean_token_accuracy": 0.17074308097362517,
|
|
"num_tokens": 7123860.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 5.571376514434815,
|
|
"epoch": 0.29923150816522576,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995332557827337,
|
|
"loss": 5.5564,
|
|
"mean_token_accuracy": 0.17600722908973693,
|
|
"num_tokens": 7135901.0,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"entropy": 5.5778998851776125,
|
|
"epoch": 0.29971181556195964,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995310460675416,
|
|
"loss": 5.5339,
|
|
"mean_token_accuracy": 0.1845734417438507,
|
|
"num_tokens": 7148743.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 5.589261770248413,
|
|
"epoch": 0.3001921229586936,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004995288311394053,
|
|
"loss": 5.5804,
|
|
"mean_token_accuracy": 0.18021756410598755,
|
|
"num_tokens": 7160731.0,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"entropy": 5.574976587295533,
|
|
"epoch": 0.30067243035542746,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004995266109983764,
|
|
"loss": 5.5617,
|
|
"mean_token_accuracy": 0.17890461087226867,
|
|
"num_tokens": 7172861.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 5.5695881843566895,
|
|
"epoch": 0.3011527377521614,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995243856445062,
|
|
"loss": 5.5087,
|
|
"mean_token_accuracy": 0.17425711154937745,
|
|
"num_tokens": 7183954.0,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"entropy": 5.523225164413452,
|
|
"epoch": 0.3016330451488953,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004995221550778466,
|
|
"loss": 5.4793,
|
|
"mean_token_accuracy": 0.1828732267022133,
|
|
"num_tokens": 7195466.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 5.535993862152099,
|
|
"epoch": 0.3021133525456292,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995199192984491,
|
|
"loss": 5.4733,
|
|
"mean_token_accuracy": 0.18358256071805953,
|
|
"num_tokens": 7207173.0,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"entropy": 5.601380920410156,
|
|
"epoch": 0.3025936599423631,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004995176783063657,
|
|
"loss": 5.6094,
|
|
"mean_token_accuracy": 0.17880836874246597,
|
|
"num_tokens": 7220095.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 5.5713316917419435,
|
|
"epoch": 0.30307396733909703,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995154321016487,
|
|
"loss": 5.5217,
|
|
"mean_token_accuracy": 0.18463317751884462,
|
|
"num_tokens": 7230664.0,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"entropy": 5.5087896347045895,
|
|
"epoch": 0.3035542747358309,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995131806843499,
|
|
"loss": 5.4837,
|
|
"mean_token_accuracy": 0.18419086784124375,
|
|
"num_tokens": 7241278.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 5.4533278465271,
|
|
"epoch": 0.30403458213256485,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995109240545218,
|
|
"loss": 5.6281,
|
|
"mean_token_accuracy": 0.1725993424654007,
|
|
"num_tokens": 7252999.0,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"entropy": 5.589286613464355,
|
|
"epoch": 0.3045148895292987,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995086622122167,
|
|
"loss": 5.4738,
|
|
"mean_token_accuracy": 0.17775996774435043,
|
|
"num_tokens": 7263949.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 5.558937978744507,
|
|
"epoch": 0.30499519692603266,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004995063951574871,
|
|
"loss": 5.5219,
|
|
"mean_token_accuracy": 0.18208030313253404,
|
|
"num_tokens": 7275467.0,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"entropy": 5.563764429092407,
|
|
"epoch": 0.30547550432276654,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004995041228903856,
|
|
"loss": 5.4858,
|
|
"mean_token_accuracy": 0.18617523461580276,
|
|
"num_tokens": 7285534.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 5.614857864379883,
|
|
"epoch": 0.3059558117195005,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499501845410965,
|
|
"loss": 5.5985,
|
|
"mean_token_accuracy": 0.18059034049510955,
|
|
"num_tokens": 7297252.0,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"entropy": 5.526304435729981,
|
|
"epoch": 0.30643611911623436,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004994995627192781,
|
|
"loss": 5.4686,
|
|
"mean_token_accuracy": 0.18378556221723558,
|
|
"num_tokens": 7308492.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 5.5130932331085205,
|
|
"epoch": 0.3069164265129683,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004994972748153781,
|
|
"loss": 5.5122,
|
|
"mean_token_accuracy": 0.18087892532348632,
|
|
"num_tokens": 7319703.0,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"entropy": 5.598230838775635,
|
|
"epoch": 0.30739673390970224,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499494981699318,
|
|
"loss": 5.4766,
|
|
"mean_token_accuracy": 0.18629593551158904,
|
|
"num_tokens": 7331022.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 5.5110736846923825,
|
|
"epoch": 0.3078770413064361,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499492683371151,
|
|
"loss": 5.5125,
|
|
"mean_token_accuracy": 0.18337176293134688,
|
|
"num_tokens": 7342977.0,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"entropy": 5.602800512313843,
|
|
"epoch": 0.30835734870317005,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004994903798309306,
|
|
"loss": 5.5087,
|
|
"mean_token_accuracy": 0.17746395766735076,
|
|
"num_tokens": 7353227.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 5.563166570663452,
|
|
"epoch": 0.30883765609990393,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994880710787102,
|
|
"loss": 5.5743,
|
|
"mean_token_accuracy": 0.1642255187034607,
|
|
"num_tokens": 7364165.0,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"entropy": 5.544680643081665,
|
|
"epoch": 0.30931796349663787,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004994857571145432,
|
|
"loss": 5.5023,
|
|
"mean_token_accuracy": 0.18458254784345626,
|
|
"num_tokens": 7374800.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 5.425434350967407,
|
|
"epoch": 0.30979827089337175,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004994834379384837,
|
|
"loss": 5.4565,
|
|
"mean_token_accuracy": 0.18336665779352188,
|
|
"num_tokens": 7386360.0,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"entropy": 5.552868223190307,
|
|
"epoch": 0.3102785782901057,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004994811135505851,
|
|
"loss": 5.4698,
|
|
"mean_token_accuracy": 0.18341365456581116,
|
|
"num_tokens": 7397066.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 5.558938503265381,
|
|
"epoch": 0.31075888568683957,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994787839509018,
|
|
"loss": 5.564,
|
|
"mean_token_accuracy": 0.1713826075196266,
|
|
"num_tokens": 7408349.0,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"entropy": 5.5813216209411625,
|
|
"epoch": 0.3112391930835735,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004994764491394876,
|
|
"loss": 5.5886,
|
|
"mean_token_accuracy": 0.17263369262218475,
|
|
"num_tokens": 7420343.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 5.624362230300903,
|
|
"epoch": 0.3117195004803074,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004994741091163969,
|
|
"loss": 5.4904,
|
|
"mean_token_accuracy": 0.18449428975582122,
|
|
"num_tokens": 7431683.0,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"entropy": 5.41058030128479,
|
|
"epoch": 0.3121998078770413,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499471763881684,
|
|
"loss": 5.4083,
|
|
"mean_token_accuracy": 0.18659997135400772,
|
|
"num_tokens": 7443327.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 5.545905923843383,
|
|
"epoch": 0.3126801152737752,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004994694134354031,
|
|
"loss": 5.517,
|
|
"mean_token_accuracy": 0.18232496678829194,
|
|
"num_tokens": 7454002.0,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"entropy": 5.49485216140747,
|
|
"epoch": 0.31316042267050914,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499467057777609,
|
|
"loss": 5.5092,
|
|
"mean_token_accuracy": 0.18318750262260436,
|
|
"num_tokens": 7464074.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 5.470322179794311,
|
|
"epoch": 0.313640730067243,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004994646969083565,
|
|
"loss": 5.434,
|
|
"mean_token_accuracy": 0.1871152251958847,
|
|
"num_tokens": 7475543.0,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"entropy": 5.583432674407959,
|
|
"epoch": 0.31412103746397696,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004994623308277002,
|
|
"loss": 5.4947,
|
|
"mean_token_accuracy": 0.18215811550617217,
|
|
"num_tokens": 7486818.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 5.5460193157196045,
|
|
"epoch": 0.31460134486071084,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499459959535695,
|
|
"loss": 5.5431,
|
|
"mean_token_accuracy": 0.17775923311710357,
|
|
"num_tokens": 7499046.0,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"entropy": 5.530418539047242,
|
|
"epoch": 0.3150816522574448,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004994575830323962,
|
|
"loss": 5.4758,
|
|
"mean_token_accuracy": 0.1772423878312111,
|
|
"num_tokens": 7509853.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 5.422787761688232,
|
|
"epoch": 0.31556195965417866,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004994552013178586,
|
|
"loss": 5.3345,
|
|
"mean_token_accuracy": 0.1908559814095497,
|
|
"num_tokens": 7521091.0,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"entropy": 5.470391035079956,
|
|
"epoch": 0.3160422670509126,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499452814392138,
|
|
"loss": 5.4638,
|
|
"mean_token_accuracy": 0.19296756088733674,
|
|
"num_tokens": 7531317.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 5.550863265991211,
|
|
"epoch": 0.3165225744476465,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004994504222552894,
|
|
"loss": 5.6115,
|
|
"mean_token_accuracy": 0.17447966411709787,
|
|
"num_tokens": 7542822.0,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"entropy": 5.679572725296021,
|
|
"epoch": 0.3170028818443804,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994480249073684,
|
|
"loss": 5.5371,
|
|
"mean_token_accuracy": 0.17899394482374192,
|
|
"num_tokens": 7552434.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 5.455837345123291,
|
|
"epoch": 0.3174831892411143,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004994456223484308,
|
|
"loss": 5.412,
|
|
"mean_token_accuracy": 0.1847301483154297,
|
|
"num_tokens": 7563895.0,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"entropy": 5.356154918670654,
|
|
"epoch": 0.31796349663784823,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004994432145785323,
|
|
"loss": 5.4431,
|
|
"mean_token_accuracy": 0.1852705791592598,
|
|
"num_tokens": 7575391.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 5.603661298751831,
|
|
"epoch": 0.3184438040345821,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004994408015977288,
|
|
"loss": 5.5895,
|
|
"mean_token_accuracy": 0.18396379053592682,
|
|
"num_tokens": 7587119.0,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"entropy": 5.5791820049285885,
|
|
"epoch": 0.31892411143131605,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004994383834060764,
|
|
"loss": 5.5529,
|
|
"mean_token_accuracy": 0.17733592242002488,
|
|
"num_tokens": 7598615.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 5.522308588027954,
|
|
"epoch": 0.31940441882804993,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004994359600036311,
|
|
"loss": 5.5022,
|
|
"mean_token_accuracy": 0.18452920615673066,
|
|
"num_tokens": 7610159.0,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"entropy": 5.598204278945923,
|
|
"epoch": 0.31988472622478387,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004994335313904493,
|
|
"loss": 5.4916,
|
|
"mean_token_accuracy": 0.18418505936861038,
|
|
"num_tokens": 7620922.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 5.45703272819519,
|
|
"epoch": 0.32036503362151775,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004994310975665873,
|
|
"loss": 5.4117,
|
|
"mean_token_accuracy": 0.18754592537879944,
|
|
"num_tokens": 7632343.0,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"entropy": 5.619206094741822,
|
|
"epoch": 0.3208453410182517,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004994286585321017,
|
|
"loss": 5.6097,
|
|
"mean_token_accuracy": 0.1694990485906601,
|
|
"num_tokens": 7644748.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 5.595988607406616,
|
|
"epoch": 0.32132564841498557,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499426214287049,
|
|
"loss": 5.5649,
|
|
"mean_token_accuracy": 0.18684215247631072,
|
|
"num_tokens": 7655449.0,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"entropy": 5.522005844116211,
|
|
"epoch": 0.3218059558117195,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004994237648314862,
|
|
"loss": 5.5274,
|
|
"mean_token_accuracy": 0.18205100297927856,
|
|
"num_tokens": 7665623.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 5.492083740234375,
|
|
"epoch": 0.3222862632084534,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004994213101654697,
|
|
"loss": 5.4173,
|
|
"mean_token_accuracy": 0.18764639347791673,
|
|
"num_tokens": 7676860.0,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"entropy": 5.5761909008026125,
|
|
"epoch": 0.3227665706051873,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499418850289057,
|
|
"loss": 5.603,
|
|
"mean_token_accuracy": 0.1757027193903923,
|
|
"num_tokens": 7687778.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 5.565295886993408,
|
|
"epoch": 0.32324687800192126,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004994163852023048,
|
|
"loss": 5.4981,
|
|
"mean_token_accuracy": 0.18085954636335372,
|
|
"num_tokens": 7699154.0,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"entropy": 5.525069093704223,
|
|
"epoch": 0.32372718539865514,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994139149052706,
|
|
"loss": 5.5175,
|
|
"mean_token_accuracy": 0.18480815589427949,
|
|
"num_tokens": 7711010.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 5.576666164398193,
|
|
"epoch": 0.3242074927953891,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004994114393980117,
|
|
"loss": 5.538,
|
|
"mean_token_accuracy": 0.17918068915605545,
|
|
"num_tokens": 7721969.0,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"entropy": 5.561730909347534,
|
|
"epoch": 0.32468780019212296,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004994089586805856,
|
|
"loss": 5.4863,
|
|
"mean_token_accuracy": 0.1827893927693367,
|
|
"num_tokens": 7733762.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 5.549566268920898,
|
|
"epoch": 0.3251681075888569,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004994064727530496,
|
|
"loss": 5.4963,
|
|
"mean_token_accuracy": 0.17758472561836242,
|
|
"num_tokens": 7744614.0,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"entropy": 5.498316717147827,
|
|
"epoch": 0.3256484149855908,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004994039816154618,
|
|
"loss": 5.4339,
|
|
"mean_token_accuracy": 0.18473347425460815,
|
|
"num_tokens": 7755799.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 5.455300903320312,
|
|
"epoch": 0.3261287223823247,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00049940148526788,
|
|
"loss": 5.4848,
|
|
"mean_token_accuracy": 0.18304541558027268,
|
|
"num_tokens": 7768140.0,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"entropy": 5.568225574493408,
|
|
"epoch": 0.3266090297790586,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993989837103618,
|
|
"loss": 5.4898,
|
|
"mean_token_accuracy": 0.1791609227657318,
|
|
"num_tokens": 7778494.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 5.607134199142456,
|
|
"epoch": 0.3270893371757925,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993964769429657,
|
|
"loss": 5.5675,
|
|
"mean_token_accuracy": 0.18318891525268555,
|
|
"num_tokens": 7789234.0,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"entropy": 5.541140413284301,
|
|
"epoch": 0.3275696445725264,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004993939649657498,
|
|
"loss": 5.548,
|
|
"mean_token_accuracy": 0.18319968730211258,
|
|
"num_tokens": 7800602.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 5.469655227661133,
|
|
"epoch": 0.32804995196926034,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004993914477787721,
|
|
"loss": 5.3674,
|
|
"mean_token_accuracy": 0.1912238970398903,
|
|
"num_tokens": 7812803.0,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"entropy": 5.625386571884155,
|
|
"epoch": 0.3285302593659942,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993889253820915,
|
|
"loss": 5.6669,
|
|
"mean_token_accuracy": 0.16849727183580399,
|
|
"num_tokens": 7825432.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 5.567583656311035,
|
|
"epoch": 0.32901056676272816,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993863977757663,
|
|
"loss": 5.4819,
|
|
"mean_token_accuracy": 0.18198901265859604,
|
|
"num_tokens": 7837258.0,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"entropy": 5.42762131690979,
|
|
"epoch": 0.32949087415946204,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993838649598552,
|
|
"loss": 5.3739,
|
|
"mean_token_accuracy": 0.1897459015250206,
|
|
"num_tokens": 7847573.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 5.551398038864136,
|
|
"epoch": 0.329971181556196,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993813269344171,
|
|
"loss": 5.4969,
|
|
"mean_token_accuracy": 0.17690201252698898,
|
|
"num_tokens": 7857957.0,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"entropy": 5.5013957023620605,
|
|
"epoch": 0.33045148895292986,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993787836995108,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.1926833838224411,
|
|
"num_tokens": 7867996.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 5.446499681472778,
|
|
"epoch": 0.3309317963496638,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993762352551954,
|
|
"loss": 5.4766,
|
|
"mean_token_accuracy": 0.1805843397974968,
|
|
"num_tokens": 7879245.0,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"entropy": 5.61943678855896,
|
|
"epoch": 0.3314121037463977,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004993736816015301,
|
|
"loss": 5.5669,
|
|
"mean_token_accuracy": 0.17582879960536957,
|
|
"num_tokens": 7891186.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 5.609936046600342,
|
|
"epoch": 0.3318924111431316,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004993711227385742,
|
|
"loss": 5.5802,
|
|
"mean_token_accuracy": 0.1823540985584259,
|
|
"num_tokens": 7902231.0,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"entropy": 5.523345851898194,
|
|
"epoch": 0.3323727185398655,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993685586663871,
|
|
"loss": 5.5412,
|
|
"mean_token_accuracy": 0.18139662891626357,
|
|
"num_tokens": 7913364.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 5.735165405273437,
|
|
"epoch": 0.33285302593659943,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993659893850281,
|
|
"loss": 5.7308,
|
|
"mean_token_accuracy": 0.16727230101823806,
|
|
"num_tokens": 7925217.0,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"entropy": 5.506084823608399,
|
|
"epoch": 0.3333333333333333,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004993634148945573,
|
|
"loss": 5.4639,
|
|
"mean_token_accuracy": 0.17894653379917144,
|
|
"num_tokens": 7937636.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 5.5272363185882565,
|
|
"epoch": 0.33381364073006725,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004993608351950341,
|
|
"loss": 5.4896,
|
|
"mean_token_accuracy": 0.17503666803240775,
|
|
"num_tokens": 7948958.0,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"entropy": 5.620566320419312,
|
|
"epoch": 0.33429394812680113,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004993582502865185,
|
|
"loss": 5.5323,
|
|
"mean_token_accuracy": 0.18402974754571916,
|
|
"num_tokens": 7960013.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 5.462809419631958,
|
|
"epoch": 0.33477425552353507,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993556601690706,
|
|
"loss": 5.5416,
|
|
"mean_token_accuracy": 0.17792800366878508,
|
|
"num_tokens": 7971041.0,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"entropy": 5.618744802474976,
|
|
"epoch": 0.33525456292026895,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004993530648427505,
|
|
"loss": 5.576,
|
|
"mean_token_accuracy": 0.1723045140504837,
|
|
"num_tokens": 7982752.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 5.599891996383667,
|
|
"epoch": 0.3357348703170029,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004993504643076184,
|
|
"loss": 5.4278,
|
|
"mean_token_accuracy": 0.18250093311071397,
|
|
"num_tokens": 7993681.0,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"entropy": 5.470984411239624,
|
|
"epoch": 0.33621517771373677,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993478585637347,
|
|
"loss": 5.4781,
|
|
"mean_token_accuracy": 0.18258391320705414,
|
|
"num_tokens": 8004727.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 5.505999660491943,
|
|
"epoch": 0.3366954851104707,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004993452476111599,
|
|
"loss": 5.4797,
|
|
"mean_token_accuracy": 0.18967788219451903,
|
|
"num_tokens": 8015423.0,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"entropy": 5.512713193893433,
|
|
"epoch": 0.3371757925072046,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004993426314499546,
|
|
"loss": 5.4536,
|
|
"mean_token_accuracy": 0.18748492896556854,
|
|
"num_tokens": 8027911.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 5.572777605056762,
|
|
"epoch": 0.3376560999039385,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004993400100801796,
|
|
"loss": 5.4747,
|
|
"mean_token_accuracy": 0.1818804770708084,
|
|
"num_tokens": 8038831.0,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"entropy": 5.392134952545166,
|
|
"epoch": 0.3381364073006724,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993373835018956,
|
|
"loss": 5.3718,
|
|
"mean_token_accuracy": 0.18957587629556655,
|
|
"num_tokens": 8049906.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 5.393214273452759,
|
|
"epoch": 0.33861671469740634,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004993347517151638,
|
|
"loss": 5.469,
|
|
"mean_token_accuracy": 0.18386447727680205,
|
|
"num_tokens": 8061158.0,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"entropy": 5.6083544254302975,
|
|
"epoch": 0.3390970220941403,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993321147200452,
|
|
"loss": 5.4326,
|
|
"mean_token_accuracy": 0.181746444106102,
|
|
"num_tokens": 8071958.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 5.465584182739258,
|
|
"epoch": 0.33957732949087416,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000499329472516601,
|
|
"loss": 5.4294,
|
|
"mean_token_accuracy": 0.17608542144298553,
|
|
"num_tokens": 8084068.0,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"entropy": 5.410733461380005,
|
|
"epoch": 0.3400576368876081,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004993268251048925,
|
|
"loss": 5.3472,
|
|
"mean_token_accuracy": 0.19578494429588317,
|
|
"num_tokens": 8096132.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 5.503920364379883,
|
|
"epoch": 0.340537944284342,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004993241724849814,
|
|
"loss": 5.5102,
|
|
"mean_token_accuracy": 0.18362511545419694,
|
|
"num_tokens": 8107327.0,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"entropy": 5.497963953018188,
|
|
"epoch": 0.3410182516810759,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499321514656929,
|
|
"loss": 5.4779,
|
|
"mean_token_accuracy": 0.18374822586774825,
|
|
"num_tokens": 8118584.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 5.550964641571045,
|
|
"epoch": 0.3414985590778098,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004993188516207972,
|
|
"loss": 5.5337,
|
|
"mean_token_accuracy": 0.1793607845902443,
|
|
"num_tokens": 8130081.0,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"entropy": 5.507245492935181,
|
|
"epoch": 0.34197886647454373,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004993161833766478,
|
|
"loss": 5.4932,
|
|
"mean_token_accuracy": 0.1838148668408394,
|
|
"num_tokens": 8141463.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 5.541257572174072,
|
|
"epoch": 0.3424591738712776,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993135099245426,
|
|
"loss": 5.5042,
|
|
"mean_token_accuracy": 0.17985130697488785,
|
|
"num_tokens": 8153863.0,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"entropy": 5.428792333602905,
|
|
"epoch": 0.34293948126801155,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004993108312645438,
|
|
"loss": 5.463,
|
|
"mean_token_accuracy": 0.18102106750011443,
|
|
"num_tokens": 8165695.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 5.5374926090240475,
|
|
"epoch": 0.34341978866474543,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004993081473967135,
|
|
"loss": 5.5119,
|
|
"mean_token_accuracy": 0.18098655641078948,
|
|
"num_tokens": 8176456.0,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"entropy": 5.58543210029602,
|
|
"epoch": 0.34390009606147937,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004993054583211143,
|
|
"loss": 5.5092,
|
|
"mean_token_accuracy": 0.1822955548763275,
|
|
"num_tokens": 8189050.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 5.43015308380127,
|
|
"epoch": 0.34438040345821325,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004993027640378081,
|
|
"loss": 5.4081,
|
|
"mean_token_accuracy": 0.185765840113163,
|
|
"num_tokens": 8200011.0,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"entropy": 5.474026918411255,
|
|
"epoch": 0.3448607108549472,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499300064546858,
|
|
"loss": 5.4183,
|
|
"mean_token_accuracy": 0.1868817389011383,
|
|
"num_tokens": 8211770.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 5.55191330909729,
|
|
"epoch": 0.34534101825168106,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992973598483264,
|
|
"loss": 5.4638,
|
|
"mean_token_accuracy": 0.18688549250364303,
|
|
"num_tokens": 8223582.0,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"entropy": 5.575275611877442,
|
|
"epoch": 0.345821325648415,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499294649942276,
|
|
"loss": 5.5846,
|
|
"mean_token_accuracy": 0.1825041502714157,
|
|
"num_tokens": 8234336.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 5.547464847564697,
|
|
"epoch": 0.3463016330451489,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004992919348287699,
|
|
"loss": 5.4941,
|
|
"mean_token_accuracy": 0.18366153985261918,
|
|
"num_tokens": 8244605.0,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"entropy": 5.5259942531585695,
|
|
"epoch": 0.3467819404418828,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004992892145078711,
|
|
"loss": 5.5254,
|
|
"mean_token_accuracy": 0.17931086868047713,
|
|
"num_tokens": 8255876.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 5.4697678565979,
|
|
"epoch": 0.3472622478386167,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004992864889796427,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.18721913993358613,
|
|
"num_tokens": 8266602.0,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"entropy": 5.546818780899048,
|
|
"epoch": 0.34774255523535064,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004992837582441481,
|
|
"loss": 5.4216,
|
|
"mean_token_accuracy": 0.18347607105970382,
|
|
"num_tokens": 8279804.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 5.569514989852905,
|
|
"epoch": 0.3482228626320845,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004992810223014506,
|
|
"loss": 5.5242,
|
|
"mean_token_accuracy": 0.1833881989121437,
|
|
"num_tokens": 8291020.0,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"entropy": 5.5203827857971195,
|
|
"epoch": 0.34870317002881845,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004992782811516137,
|
|
"loss": 5.4727,
|
|
"mean_token_accuracy": 0.18729409873485564,
|
|
"num_tokens": 8302192.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 5.496627855300903,
|
|
"epoch": 0.34918347742555234,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004992755347947011,
|
|
"loss": 5.4324,
|
|
"mean_token_accuracy": 0.18265776634216307,
|
|
"num_tokens": 8313649.0,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"entropy": 5.44870662689209,
|
|
"epoch": 0.34966378482228627,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004992727832307766,
|
|
"loss": 5.4304,
|
|
"mean_token_accuracy": 0.18587879687547684,
|
|
"num_tokens": 8324694.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 5.604543972015381,
|
|
"epoch": 0.35014409221902015,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004992700264599039,
|
|
"loss": 5.594,
|
|
"mean_token_accuracy": 0.1727964922785759,
|
|
"num_tokens": 8336517.0,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"entropy": 5.540855789184571,
|
|
"epoch": 0.3506243996157541,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004992672644821473,
|
|
"loss": 5.5425,
|
|
"mean_token_accuracy": 0.1779757022857666,
|
|
"num_tokens": 8349001.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 5.5626523971557615,
|
|
"epoch": 0.35110470701248797,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004992644972975707,
|
|
"loss": 5.4537,
|
|
"mean_token_accuracy": 0.1864044651389122,
|
|
"num_tokens": 8361230.0,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"entropy": 5.394788694381714,
|
|
"epoch": 0.3515850144092219,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004992617249062383,
|
|
"loss": 5.3924,
|
|
"mean_token_accuracy": 0.19216873198747636,
|
|
"num_tokens": 8372159.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 5.543751049041748,
|
|
"epoch": 0.3520653218059558,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004992589473082147,
|
|
"loss": 5.5214,
|
|
"mean_token_accuracy": 0.18608528524637222,
|
|
"num_tokens": 8383228.0,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"entropy": 5.509809923171997,
|
|
"epoch": 0.3525456292026897,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004992561645035641,
|
|
"loss": 5.4561,
|
|
"mean_token_accuracy": 0.18168068826198577,
|
|
"num_tokens": 8394582.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 5.514116191864014,
|
|
"epoch": 0.3530259365994236,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004992533764923515,
|
|
"loss": 5.4481,
|
|
"mean_token_accuracy": 0.18126334249973297,
|
|
"num_tokens": 8406784.0,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"entropy": 5.483726072311401,
|
|
"epoch": 0.35350624399615754,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004992505832746412,
|
|
"loss": 5.4286,
|
|
"mean_token_accuracy": 0.19101243019104003,
|
|
"num_tokens": 8418405.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 5.5265562534332275,
|
|
"epoch": 0.3539865513928914,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004992477848504983,
|
|
"loss": 5.392,
|
|
"mean_token_accuracy": 0.18716304898262023,
|
|
"num_tokens": 8430432.0,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"entropy": 5.479315328598022,
|
|
"epoch": 0.35446685878962536,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004992449812199877,
|
|
"loss": 5.5635,
|
|
"mean_token_accuracy": 0.17799893915653228,
|
|
"num_tokens": 8442423.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 5.518668079376221,
|
|
"epoch": 0.3549471661863593,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004992421723831745,
|
|
"loss": 5.546,
|
|
"mean_token_accuracy": 0.1842621758580208,
|
|
"num_tokens": 8454951.0,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"entropy": 5.520323848724365,
|
|
"epoch": 0.3554274735830932,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004992393583401239,
|
|
"loss": 5.4033,
|
|
"mean_token_accuracy": 0.18851898312568666,
|
|
"num_tokens": 8467758.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 5.475191354751587,
|
|
"epoch": 0.3559077809798271,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004992365390909014,
|
|
"loss": 5.4854,
|
|
"mean_token_accuracy": 0.17992179691791535,
|
|
"num_tokens": 8479728.0,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"entropy": 5.535838651657104,
|
|
"epoch": 0.356388088376561,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004992337146355721,
|
|
"loss": 5.552,
|
|
"mean_token_accuracy": 0.17727553099393845,
|
|
"num_tokens": 8492099.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 5.610863542556762,
|
|
"epoch": 0.35686839577329493,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004992308849742019,
|
|
"loss": 5.4819,
|
|
"mean_token_accuracy": 0.17355056405067443,
|
|
"num_tokens": 8504657.0,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"entropy": 5.48232364654541,
|
|
"epoch": 0.3573487031700288,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004992280501068563,
|
|
"loss": 5.4509,
|
|
"mean_token_accuracy": 0.18914830237627028,
|
|
"num_tokens": 8514728.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 5.528886175155639,
|
|
"epoch": 0.35782901056676275,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004992252100336012,
|
|
"loss": 5.581,
|
|
"mean_token_accuracy": 0.1833130970597267,
|
|
"num_tokens": 8525588.0,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"entropy": 5.540911626815796,
|
|
"epoch": 0.35830931796349663,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992223647545027,
|
|
"loss": 5.527,
|
|
"mean_token_accuracy": 0.18297800421714783,
|
|
"num_tokens": 8537468.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 5.5527503490448,
|
|
"epoch": 0.35878962536023057,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004992195142696266,
|
|
"loss": 5.438,
|
|
"mean_token_accuracy": 0.18914629518985748,
|
|
"num_tokens": 8548598.0,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"entropy": 5.33068585395813,
|
|
"epoch": 0.35926993275696445,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004992166585790391,
|
|
"loss": 5.3396,
|
|
"mean_token_accuracy": 0.19562919437885284,
|
|
"num_tokens": 8560301.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 5.483434391021729,
|
|
"epoch": 0.3597502401536984,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004992137976828067,
|
|
"loss": 5.4516,
|
|
"mean_token_accuracy": 0.18603197634220123,
|
|
"num_tokens": 8571186.0,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"entropy": 5.484015607833863,
|
|
"epoch": 0.36023054755043227,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004992109315809955,
|
|
"loss": 5.4383,
|
|
"mean_token_accuracy": 0.18905191421508788,
|
|
"num_tokens": 8580725.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 5.519361686706543,
|
|
"epoch": 0.3607108549471662,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004992080602736725,
|
|
"loss": 5.5532,
|
|
"mean_token_accuracy": 0.1773756206035614,
|
|
"num_tokens": 8594598.0,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"entropy": 5.643574905395508,
|
|
"epoch": 0.3611911623439001,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004992051837609039,
|
|
"loss": 5.5404,
|
|
"mean_token_accuracy": 0.17730522602796556,
|
|
"num_tokens": 8606733.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 5.508514451980591,
|
|
"epoch": 0.361671469740634,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004992023020427568,
|
|
"loss": 5.4788,
|
|
"mean_token_accuracy": 0.18672696501016617,
|
|
"num_tokens": 8618863.0,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"entropy": 5.3892511367797855,
|
|
"epoch": 0.3621517771373679,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004991994151192979,
|
|
"loss": 5.3304,
|
|
"mean_token_accuracy": 0.18849435597658157,
|
|
"num_tokens": 8629270.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 5.4767759323120115,
|
|
"epoch": 0.36263208453410184,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991965229905943,
|
|
"loss": 5.5364,
|
|
"mean_token_accuracy": 0.18494855612516403,
|
|
"num_tokens": 8641363.0,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"entropy": 5.6278270244598385,
|
|
"epoch": 0.3631123919308357,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004991936256567133,
|
|
"loss": 5.4992,
|
|
"mean_token_accuracy": 0.18451761305332184,
|
|
"num_tokens": 8653233.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 5.4851010799407955,
|
|
"epoch": 0.36359269932756966,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.000499190723117722,
|
|
"loss": 5.487,
|
|
"mean_token_accuracy": 0.17836329340934753,
|
|
"num_tokens": 8665192.0,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"entropy": 5.579302835464477,
|
|
"epoch": 0.36407300672430354,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004991878153736877,
|
|
"loss": 5.5583,
|
|
"mean_token_accuracy": 0.17446503937244415,
|
|
"num_tokens": 8677669.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 5.419927787780762,
|
|
"epoch": 0.3645533141210375,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004991849024246781,
|
|
"loss": 5.3676,
|
|
"mean_token_accuracy": 0.18973670154809952,
|
|
"num_tokens": 8688002.0,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"entropy": 5.438193988800049,
|
|
"epoch": 0.36503362151777136,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004991819842707608,
|
|
"loss": 5.4133,
|
|
"mean_token_accuracy": 0.18962489068508148,
|
|
"num_tokens": 8698396.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 5.543167686462402,
|
|
"epoch": 0.3655139289145053,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991790609120035,
|
|
"loss": 5.4297,
|
|
"mean_token_accuracy": 0.18700562715530394,
|
|
"num_tokens": 8711135.0,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"entropy": 5.469641494750976,
|
|
"epoch": 0.3659942363112392,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499176132348474,
|
|
"loss": 5.4735,
|
|
"mean_token_accuracy": 0.1897922232747078,
|
|
"num_tokens": 8723707.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 5.582857084274292,
|
|
"epoch": 0.3664745437079731,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004991731985802405,
|
|
"loss": 5.4338,
|
|
"mean_token_accuracy": 0.18693850934505463,
|
|
"num_tokens": 8734193.0,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"entropy": 5.444149160385132,
|
|
"epoch": 0.366954851104707,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004991702596073708,
|
|
"loss": 5.4841,
|
|
"mean_token_accuracy": 0.18134361505508423,
|
|
"num_tokens": 8745619.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 5.426347923278809,
|
|
"epoch": 0.36743515850144093,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004991673154299335,
|
|
"loss": 5.4231,
|
|
"mean_token_accuracy": 0.18122087568044662,
|
|
"num_tokens": 8757331.0,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"entropy": 5.515204238891601,
|
|
"epoch": 0.3679154658981748,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004991643660479967,
|
|
"loss": 5.428,
|
|
"mean_token_accuracy": 0.1868494287133217,
|
|
"num_tokens": 8768840.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 5.460073804855346,
|
|
"epoch": 0.36839577329490875,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004991614114616289,
|
|
"loss": 5.3818,
|
|
"mean_token_accuracy": 0.18779707103967666,
|
|
"num_tokens": 8781214.0,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"entropy": 5.510246324539184,
|
|
"epoch": 0.3688760806916426,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004991584516708988,
|
|
"loss": 5.4477,
|
|
"mean_token_accuracy": 0.18548956960439683,
|
|
"num_tokens": 8791645.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 5.5942995071411135,
|
|
"epoch": 0.36935638808837656,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004991554866758751,
|
|
"loss": 5.6333,
|
|
"mean_token_accuracy": 0.1739022307097912,
|
|
"num_tokens": 8803286.0,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"entropy": 5.493673467636109,
|
|
"epoch": 0.36983669548511044,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004991525164766265,
|
|
"loss": 5.4163,
|
|
"mean_token_accuracy": 0.1872221603989601,
|
|
"num_tokens": 8814207.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 5.503255462646484,
|
|
"epoch": 0.3703170028818444,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004991495410732222,
|
|
"loss": 5.4683,
|
|
"mean_token_accuracy": 0.17725101560354234,
|
|
"num_tokens": 8825540.0,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"entropy": 5.5069482803344725,
|
|
"epoch": 0.37079731027857826,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004991465604657311,
|
|
"loss": 5.5937,
|
|
"mean_token_accuracy": 0.17322031259536744,
|
|
"num_tokens": 8838182.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 5.526088094711303,
|
|
"epoch": 0.3712776176753122,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004991435746542224,
|
|
"loss": 5.4654,
|
|
"mean_token_accuracy": 0.18988653868436814,
|
|
"num_tokens": 8850211.0,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"entropy": 5.439452648162842,
|
|
"epoch": 0.37175792507204614,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004991405836387655,
|
|
"loss": 5.5032,
|
|
"mean_token_accuracy": 0.18108827471733094,
|
|
"num_tokens": 8862804.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 5.529762125015258,
|
|
"epoch": 0.37223823246878,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004991375874194298,
|
|
"loss": 5.4602,
|
|
"mean_token_accuracy": 0.17960784435272217,
|
|
"num_tokens": 8874112.0,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"entropy": 5.469674205780029,
|
|
"epoch": 0.37271853986551395,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499134585996285,
|
|
"loss": 5.477,
|
|
"mean_token_accuracy": 0.18614101260900498,
|
|
"num_tokens": 8885114.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 5.554774141311645,
|
|
"epoch": 0.37319884726224783,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004991315793694004,
|
|
"loss": 5.3691,
|
|
"mean_token_accuracy": 0.18807282894849778,
|
|
"num_tokens": 8895555.0,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"entropy": 5.405085754394531,
|
|
"epoch": 0.37367915465898177,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004991285675388463,
|
|
"loss": 5.3765,
|
|
"mean_token_accuracy": 0.19634046405553818,
|
|
"num_tokens": 8906073.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 5.501630163192749,
|
|
"epoch": 0.37415946205571565,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004991255505046922,
|
|
"loss": 5.5188,
|
|
"mean_token_accuracy": 0.1789945885539055,
|
|
"num_tokens": 8916587.0,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"entropy": 5.550557231903076,
|
|
"epoch": 0.3746397694524496,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991225282670083,
|
|
"loss": 5.4113,
|
|
"mean_token_accuracy": 0.1861289381980896,
|
|
"num_tokens": 8927923.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 5.382868242263794,
|
|
"epoch": 0.37512007684918347,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499119500825865,
|
|
"loss": 5.4579,
|
|
"mean_token_accuracy": 0.18377629071474075,
|
|
"num_tokens": 8939939.0,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"entropy": 5.397466945648193,
|
|
"epoch": 0.3756003842459174,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004991164681813323,
|
|
"loss": 5.4378,
|
|
"mean_token_accuracy": 0.19209783971309663,
|
|
"num_tokens": 8951748.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 5.485667037963867,
|
|
"epoch": 0.3760806916426513,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004991134303334807,
|
|
"loss": 5.3588,
|
|
"mean_token_accuracy": 0.19007459729909898,
|
|
"num_tokens": 8962922.0,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"entropy": 5.372178030014038,
|
|
"epoch": 0.3765609990393852,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004991103872823807,
|
|
"loss": 5.3442,
|
|
"mean_token_accuracy": 0.19452154785394668,
|
|
"num_tokens": 8974013.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 5.436591958999633,
|
|
"epoch": 0.3770413064361191,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499107339028103,
|
|
"loss": 5.4262,
|
|
"mean_token_accuracy": 0.18169266134500503,
|
|
"num_tokens": 8986032.0,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"entropy": 5.542058515548706,
|
|
"epoch": 0.37752161383285304,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991042855707184,
|
|
"loss": 5.4187,
|
|
"mean_token_accuracy": 0.1796349912881851,
|
|
"num_tokens": 8996889.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 5.436617517471314,
|
|
"epoch": 0.3780019212295869,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991012269102977,
|
|
"loss": 5.3992,
|
|
"mean_token_accuracy": 0.18429471999406816,
|
|
"num_tokens": 9007594.0,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"entropy": 5.426474618911743,
|
|
"epoch": 0.37848222862632086,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004990981630469119,
|
|
"loss": 5.402,
|
|
"mean_token_accuracy": 0.18193352967500687,
|
|
"num_tokens": 9018097.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 5.5093968391418455,
|
|
"epoch": 0.37896253602305474,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004990950939806323,
|
|
"loss": 5.5113,
|
|
"mean_token_accuracy": 0.18117111474275588,
|
|
"num_tokens": 9029554.0,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"entropy": 5.489337825775147,
|
|
"epoch": 0.3794428434197887,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00049909201971153,
|
|
"loss": 5.3772,
|
|
"mean_token_accuracy": 0.1829820305109024,
|
|
"num_tokens": 9042518.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 5.421378660202026,
|
|
"epoch": 0.37992315081652256,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004990889402396763,
|
|
"loss": 5.4316,
|
|
"mean_token_accuracy": 0.18639881759881974,
|
|
"num_tokens": 9054524.0,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"entropy": 5.510490798950196,
|
|
"epoch": 0.3804034582132565,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004990858555651431,
|
|
"loss": 5.4016,
|
|
"mean_token_accuracy": 0.18468015938997268,
|
|
"num_tokens": 9065375.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 5.44808177947998,
|
|
"epoch": 0.3808837656099904,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004990827656880015,
|
|
"loss": 5.3509,
|
|
"mean_token_accuracy": 0.1859322890639305,
|
|
"num_tokens": 9076338.0,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"entropy": 5.432799911499023,
|
|
"epoch": 0.3813640730067243,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004990796706083235,
|
|
"loss": 5.4011,
|
|
"mean_token_accuracy": 0.18659975230693818,
|
|
"num_tokens": 9088407.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 5.426470470428467,
|
|
"epoch": 0.3818443804034582,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004990765703261809,
|
|
"loss": 5.3649,
|
|
"mean_token_accuracy": 0.18807975053787232,
|
|
"num_tokens": 9099833.0,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"entropy": 5.350304222106933,
|
|
"epoch": 0.38232468780019213,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004990734648416458,
|
|
"loss": 5.3388,
|
|
"mean_token_accuracy": 0.189335997402668,
|
|
"num_tokens": 9111126.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 5.505539417266846,
|
|
"epoch": 0.382804995196926,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004990703541547901,
|
|
"loss": 5.4548,
|
|
"mean_token_accuracy": 0.1886373370885849,
|
|
"num_tokens": 9121979.0,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"entropy": 5.520917081832886,
|
|
"epoch": 0.38328530259365995,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004990672382656863,
|
|
"loss": 5.4535,
|
|
"mean_token_accuracy": 0.18644375950098038,
|
|
"num_tokens": 9132929.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 5.485851383209228,
|
|
"epoch": 0.38376560999039383,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004990641171744064,
|
|
"loss": 5.4111,
|
|
"mean_token_accuracy": 0.1882080391049385,
|
|
"num_tokens": 9143903.0,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"entropy": 5.495297384262085,
|
|
"epoch": 0.38424591738712777,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004990609908810231,
|
|
"loss": 5.5045,
|
|
"mean_token_accuracy": 0.18192221075296403,
|
|
"num_tokens": 9154416.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 5.513756942749024,
|
|
"epoch": 0.38472622478386165,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004990578593856089,
|
|
"loss": 5.4805,
|
|
"mean_token_accuracy": 0.18242392241954802,
|
|
"num_tokens": 9165613.0,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"entropy": 5.4664655208587645,
|
|
"epoch": 0.3852065321805956,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004990547226882366,
|
|
"loss": 5.433,
|
|
"mean_token_accuracy": 0.18787842243909836,
|
|
"num_tokens": 9177884.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 5.5449103832244875,
|
|
"epoch": 0.38568683957732947,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004990515807889788,
|
|
"loss": 5.5669,
|
|
"mean_token_accuracy": 0.17467134743928908,
|
|
"num_tokens": 9190041.0,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"entropy": 5.556881046295166,
|
|
"epoch": 0.3861671469740634,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004990484336879087,
|
|
"loss": 5.4402,
|
|
"mean_token_accuracy": 0.18740091025829314,
|
|
"num_tokens": 9202390.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 5.409300327301025,
|
|
"epoch": 0.3866474543707973,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004990452813850992,
|
|
"loss": 5.4373,
|
|
"mean_token_accuracy": 0.18635576069355012,
|
|
"num_tokens": 9213437.0,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"entropy": 5.554971408843994,
|
|
"epoch": 0.3871277617675312,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004990421238806236,
|
|
"loss": 5.517,
|
|
"mean_token_accuracy": 0.17564513981342317,
|
|
"num_tokens": 9226310.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 5.530429458618164,
|
|
"epoch": 0.38760806916426516,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004990389611745551,
|
|
"loss": 5.4495,
|
|
"mean_token_accuracy": 0.1819504901766777,
|
|
"num_tokens": 9236271.0,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"entropy": 5.516104078292846,
|
|
"epoch": 0.38808837656099904,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004990357932669672,
|
|
"loss": 5.5245,
|
|
"mean_token_accuracy": 0.18500009030103684,
|
|
"num_tokens": 9247755.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 5.464123487472534,
|
|
"epoch": 0.388568683957733,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004990326201579335,
|
|
"loss": 5.361,
|
|
"mean_token_accuracy": 0.19129124879837037,
|
|
"num_tokens": 9259821.0,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"entropy": 5.4668073654174805,
|
|
"epoch": 0.38904899135446686,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004990294418475274,
|
|
"loss": 5.4631,
|
|
"mean_token_accuracy": 0.18641964942216874,
|
|
"num_tokens": 9270663.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 5.465627670288086,
|
|
"epoch": 0.3895292987512008,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004990262583358231,
|
|
"loss": 5.4879,
|
|
"mean_token_accuracy": 0.17998379915952684,
|
|
"num_tokens": 9282588.0,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"entropy": 5.510502290725708,
|
|
"epoch": 0.3900096061479347,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004990230696228943,
|
|
"loss": 5.4397,
|
|
"mean_token_accuracy": 0.17829088270664215,
|
|
"num_tokens": 9293368.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 5.477728748321534,
|
|
"epoch": 0.3904899135446686,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004990198757088149,
|
|
"loss": 5.5128,
|
|
"mean_token_accuracy": 0.1811017781496048,
|
|
"num_tokens": 9305962.0,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"entropy": 5.508330774307251,
|
|
"epoch": 0.3909702209414025,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004990166765936593,
|
|
"loss": 5.393,
|
|
"mean_token_accuracy": 0.19244694262742995,
|
|
"num_tokens": 9317955.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 5.450256824493408,
|
|
"epoch": 0.3914505283381364,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004990134722775016,
|
|
"loss": 5.3934,
|
|
"mean_token_accuracy": 0.19047792106866837,
|
|
"num_tokens": 9329491.0,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"entropy": 5.451663637161255,
|
|
"epoch": 0.3919308357348703,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004990102627604162,
|
|
"loss": 5.5273,
|
|
"mean_token_accuracy": 0.19028781056404115,
|
|
"num_tokens": 9341612.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 5.524235773086548,
|
|
"epoch": 0.39241114313160425,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004990070480424778,
|
|
"loss": 5.458,
|
|
"mean_token_accuracy": 0.18043633103370665,
|
|
"num_tokens": 9352302.0,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"entropy": 5.440912199020386,
|
|
"epoch": 0.3928914505283381,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004990038281237608,
|
|
"loss": 5.3919,
|
|
"mean_token_accuracy": 0.1852226436138153,
|
|
"num_tokens": 9363303.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 5.433840227127075,
|
|
"epoch": 0.39337175792507206,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004990006030043401,
|
|
"loss": 5.3732,
|
|
"mean_token_accuracy": 0.1849522888660431,
|
|
"num_tokens": 9375878.0,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"entropy": 5.470492124557495,
|
|
"epoch": 0.39385206532180594,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004989973726842906,
|
|
"loss": 5.4145,
|
|
"mean_token_accuracy": 0.18103147149086,
|
|
"num_tokens": 9388342.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 5.44459342956543,
|
|
"epoch": 0.3943323727185399,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004989941371636872,
|
|
"loss": 5.3549,
|
|
"mean_token_accuracy": 0.1901955187320709,
|
|
"num_tokens": 9399047.0,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"entropy": 5.449139881134033,
|
|
"epoch": 0.39481268011527376,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004989908964426051,
|
|
"loss": 5.4342,
|
|
"mean_token_accuracy": 0.18933464139699935,
|
|
"num_tokens": 9410172.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 5.547493505477905,
|
|
"epoch": 0.3952929875120077,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004989876505211194,
|
|
"loss": 5.5794,
|
|
"mean_token_accuracy": 0.17717085629701615,
|
|
"num_tokens": 9422287.0,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"entropy": 5.5754584789276125,
|
|
"epoch": 0.3957732949087416,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004989843993993056,
|
|
"loss": 5.44,
|
|
"mean_token_accuracy": 0.18759053498506545,
|
|
"num_tokens": 9433709.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 5.341240167617798,
|
|
"epoch": 0.3962536023054755,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004989811430772392,
|
|
"loss": 5.3199,
|
|
"mean_token_accuracy": 0.189169280230999,
|
|
"num_tokens": 9445138.0,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"entropy": 5.4137170791625975,
|
|
"epoch": 0.3967339097022094,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004989778815549957,
|
|
"loss": 5.4579,
|
|
"mean_token_accuracy": 0.1827932521700859,
|
|
"num_tokens": 9455263.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 5.533003664016723,
|
|
"epoch": 0.39721421709894333,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004989746148326508,
|
|
"loss": 5.4184,
|
|
"mean_token_accuracy": 0.18644048422574996,
|
|
"num_tokens": 9465491.0,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"entropy": 5.372505331039429,
|
|
"epoch": 0.3976945244956772,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004989713429102805,
|
|
"loss": 5.3821,
|
|
"mean_token_accuracy": 0.1837732046842575,
|
|
"num_tokens": 9477601.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 5.426533985137939,
|
|
"epoch": 0.39817483189241115,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004989680657879607,
|
|
"loss": 5.4426,
|
|
"mean_token_accuracy": 0.18387902528047562,
|
|
"num_tokens": 9489385.0,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"entropy": 5.473710680007935,
|
|
"epoch": 0.39865513928914503,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004989647834657675,
|
|
"loss": 5.3249,
|
|
"mean_token_accuracy": 0.19230013936758042,
|
|
"num_tokens": 9501131.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 5.420683908462524,
|
|
"epoch": 0.39913544668587897,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000498961495943777,
|
|
"loss": 5.4614,
|
|
"mean_token_accuracy": 0.18854968398809432,
|
|
"num_tokens": 9513094.0,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"entropy": 5.577786207199097,
|
|
"epoch": 0.39961575408261285,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004989582032220656,
|
|
"loss": 5.5832,
|
|
"mean_token_accuracy": 0.17526223361492158,
|
|
"num_tokens": 9524538.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 5.522935295104981,
|
|
"epoch": 0.4000960614793468,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004989549053007096,
|
|
"loss": 5.3961,
|
|
"mean_token_accuracy": 0.19305580705404282,
|
|
"num_tokens": 9535284.0,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"entropy": 5.462124681472778,
|
|
"epoch": 0.40057636887608067,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004989516021797858,
|
|
"loss": 5.471,
|
|
"mean_token_accuracy": 0.18390081077814102,
|
|
"num_tokens": 9546472.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 5.499347305297851,
|
|
"epoch": 0.4010566762728146,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000498948293859371,
|
|
"loss": 5.4605,
|
|
"mean_token_accuracy": 0.18212546557188034,
|
|
"num_tokens": 9558358.0,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"entropy": 5.496229076385498,
|
|
"epoch": 0.4015369836695485,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004989449803395415,
|
|
"loss": 5.4959,
|
|
"mean_token_accuracy": 0.18471186012029647,
|
|
"num_tokens": 9570653.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 5.556100845336914,
|
|
"epoch": 0.4020172910662824,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004989416616203747,
|
|
"loss": 5.4386,
|
|
"mean_token_accuracy": 0.18714374899864197,
|
|
"num_tokens": 9582150.0,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"entropy": 5.4823558807373045,
|
|
"epoch": 0.4024975984630163,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004989383377019476,
|
|
"loss": 5.38,
|
|
"mean_token_accuracy": 0.19184014648199083,
|
|
"num_tokens": 9592462.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 5.375227689743042,
|
|
"epoch": 0.40297790585975024,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004989350085843371,
|
|
"loss": 5.374,
|
|
"mean_token_accuracy": 0.18951477408409118,
|
|
"num_tokens": 9604027.0,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"entropy": 5.387249088287353,
|
|
"epoch": 0.4034582132564842,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004989316742676207,
|
|
"loss": 5.3733,
|
|
"mean_token_accuracy": 0.19109322130680084,
|
|
"num_tokens": 9616325.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 5.396379852294922,
|
|
"epoch": 0.40393852065321806,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004989283347518757,
|
|
"loss": 5.3338,
|
|
"mean_token_accuracy": 0.18609212040901185,
|
|
"num_tokens": 9628133.0,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"entropy": 5.579652786254883,
|
|
"epoch": 0.404418828049952,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004989249900371797,
|
|
"loss": 5.5629,
|
|
"mean_token_accuracy": 0.17861852645874024,
|
|
"num_tokens": 9639686.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 5.429533529281616,
|
|
"epoch": 0.4048991354466859,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004989216401236103,
|
|
"loss": 5.4184,
|
|
"mean_token_accuracy": 0.18496839255094527,
|
|
"num_tokens": 9650222.0,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"entropy": 5.367856836318969,
|
|
"epoch": 0.4053794428434198,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004989182850112455,
|
|
"loss": 5.3417,
|
|
"mean_token_accuracy": 0.1997272178530693,
|
|
"num_tokens": 9661792.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 5.516646957397461,
|
|
"epoch": 0.4058597502401537,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004989149247001629,
|
|
"loss": 5.4497,
|
|
"mean_token_accuracy": 0.18383817970752717,
|
|
"num_tokens": 9673000.0,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"entropy": 5.532714462280273,
|
|
"epoch": 0.40634005763688763,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004989115591904407,
|
|
"loss": 5.3975,
|
|
"mean_token_accuracy": 0.1901587262749672,
|
|
"num_tokens": 9685253.0,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"entropy": 5.391170501708984,
|
|
"epoch": 0.4068203650336215,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004989081884821569,
|
|
"loss": 5.4004,
|
|
"mean_token_accuracy": 0.18320820480585098,
|
|
"num_tokens": 9697245.0,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"entropy": 5.450364589691162,
|
|
"epoch": 0.40730067243035545,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004989048125753899,
|
|
"loss": 5.4156,
|
|
"mean_token_accuracy": 0.18504445552825927,
|
|
"num_tokens": 9710095.0,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"entropy": 5.407678937911987,
|
|
"epoch": 0.40778097982708933,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000498901431470218,
|
|
"loss": 5.2919,
|
|
"mean_token_accuracy": 0.19396644979715347,
|
|
"num_tokens": 9721488.0,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"entropy": 5.2491998195648195,
|
|
"epoch": 0.40826128722382327,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004988980451667198,
|
|
"loss": 5.255,
|
|
"mean_token_accuracy": 0.19170391261577607,
|
|
"num_tokens": 9733280.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"entropy": 5.455927753448487,
|
|
"epoch": 0.40874159462055715,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004988946536649737,
|
|
"loss": 5.3863,
|
|
"mean_token_accuracy": 0.18661659061908722,
|
|
"num_tokens": 9744514.0,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"entropy": 5.413423871994018,
|
|
"epoch": 0.4092219020172911,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004988912569650585,
|
|
"loss": 5.3752,
|
|
"mean_token_accuracy": 0.19112140834331512,
|
|
"num_tokens": 9754931.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 5.389836359024048,
|
|
"epoch": 0.40970220941402496,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004988878550670533,
|
|
"loss": 5.3725,
|
|
"mean_token_accuracy": 0.19297343790531157,
|
|
"num_tokens": 9765635.0,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"entropy": 5.508016872406006,
|
|
"epoch": 0.4101825168107589,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004988844479710369,
|
|
"loss": 5.4792,
|
|
"mean_token_accuracy": 0.18072771430015563,
|
|
"num_tokens": 9777512.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 5.541130542755127,
|
|
"epoch": 0.4106628242074928,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004988810356770884,
|
|
"loss": 5.4764,
|
|
"mean_token_accuracy": 0.1744610548019409,
|
|
"num_tokens": 9790128.0,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"entropy": 5.451146841049194,
|
|
"epoch": 0.4111431316042267,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000498877618185287,
|
|
"loss": 5.4112,
|
|
"mean_token_accuracy": 0.19078320413827896,
|
|
"num_tokens": 9802549.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 5.365971374511719,
|
|
"epoch": 0.4116234390009606,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004988741954957121,
|
|
"loss": 5.3574,
|
|
"mean_token_accuracy": 0.18884203881025313,
|
|
"num_tokens": 9813736.0,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"entropy": 5.380771827697754,
|
|
"epoch": 0.41210374639769454,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004988707676084432,
|
|
"loss": 5.3584,
|
|
"mean_token_accuracy": 0.19705824106931685,
|
|
"num_tokens": 9823785.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 5.432324981689453,
|
|
"epoch": 0.4125840537944284,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004988673345235597,
|
|
"loss": 5.3197,
|
|
"mean_token_accuracy": 0.1934140741825104,
|
|
"num_tokens": 9834910.0,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"entropy": 5.437625408172607,
|
|
"epoch": 0.41306436119116235,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004988638962411416,
|
|
"loss": 5.363,
|
|
"mean_token_accuracy": 0.18818716257810592,
|
|
"num_tokens": 9845593.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 5.392855072021485,
|
|
"epoch": 0.41354466858789624,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004988604527612685,
|
|
"loss": 5.2697,
|
|
"mean_token_accuracy": 0.2009762555360794,
|
|
"num_tokens": 9856763.0,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"entropy": 5.503190565109253,
|
|
"epoch": 0.4140249759846302,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004988570040840205,
|
|
"loss": 5.4945,
|
|
"mean_token_accuracy": 0.18051616251468658,
|
|
"num_tokens": 9869528.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 5.407845735549927,
|
|
"epoch": 0.41450528338136405,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004988535502094774,
|
|
"loss": 5.3958,
|
|
"mean_token_accuracy": 0.18804680705070495,
|
|
"num_tokens": 9881170.0,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"entropy": 5.461514711380005,
|
|
"epoch": 0.414985590778098,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004988500911377198,
|
|
"loss": 5.4803,
|
|
"mean_token_accuracy": 0.18439086973667146,
|
|
"num_tokens": 9893119.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 5.368999385833741,
|
|
"epoch": 0.41546589817483187,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004988466268688276,
|
|
"loss": 5.3154,
|
|
"mean_token_accuracy": 0.19932861626148224,
|
|
"num_tokens": 9905339.0,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"entropy": 5.482837677001953,
|
|
"epoch": 0.4159462055715658,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004988431574028814,
|
|
"loss": 5.4002,
|
|
"mean_token_accuracy": 0.19202394932508468,
|
|
"num_tokens": 9917500.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 5.466025495529175,
|
|
"epoch": 0.4164265129682997,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004988396827399618,
|
|
"loss": 5.4808,
|
|
"mean_token_accuracy": 0.18326758295297624,
|
|
"num_tokens": 9929667.0,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"entropy": 5.48503007888794,
|
|
"epoch": 0.4169068203650336,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004988362028801495,
|
|
"loss": 5.4048,
|
|
"mean_token_accuracy": 0.18796583414077758,
|
|
"num_tokens": 9941102.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 5.412125444412231,
|
|
"epoch": 0.4173871277617675,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004988327178235253,
|
|
"loss": 5.3058,
|
|
"mean_token_accuracy": 0.1973835989832878,
|
|
"num_tokens": 9951986.0,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"entropy": 5.383547782897949,
|
|
"epoch": 0.41786743515850144,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004988292275701699,
|
|
"loss": 5.3119,
|
|
"mean_token_accuracy": 0.19086995273828505,
|
|
"num_tokens": 9964486.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 5.406881952285767,
|
|
"epoch": 0.4183477425552353,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004988257321201646,
|
|
"loss": 5.4094,
|
|
"mean_token_accuracy": 0.1860354095697403,
|
|
"num_tokens": 9975909.0,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"entropy": 5.473488092422485,
|
|
"epoch": 0.41882804995196926,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004988222314735902,
|
|
"loss": 5.4171,
|
|
"mean_token_accuracy": 0.18617332428693772,
|
|
"num_tokens": 9986951.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 5.517805814743042,
|
|
"epoch": 0.41930835734870314,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004988187256305284,
|
|
"loss": 5.5057,
|
|
"mean_token_accuracy": 0.1791812226176262,
|
|
"num_tokens": 9999234.0,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"entropy": 5.405948638916016,
|
|
"epoch": 0.4197886647454371,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004988152145910603,
|
|
"loss": 5.3792,
|
|
"mean_token_accuracy": 0.1959477871656418,
|
|
"num_tokens": 10010178.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 5.391415548324585,
|
|
"epoch": 0.420268972142171,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004988116983552675,
|
|
"loss": 5.3218,
|
|
"mean_token_accuracy": 0.18838354647159578,
|
|
"num_tokens": 10021183.0,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"entropy": 5.590651321411133,
|
|
"epoch": 0.4207492795389049,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004988081769232317,
|
|
"loss": 5.6204,
|
|
"mean_token_accuracy": 0.17428677082061766,
|
|
"num_tokens": 10033686.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 5.384156322479248,
|
|
"epoch": 0.42122958693563883,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004988046502950346,
|
|
"loss": 5.3079,
|
|
"mean_token_accuracy": 0.187077134847641,
|
|
"num_tokens": 10045923.0,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"entropy": 5.270208120346069,
|
|
"epoch": 0.4217098943323727,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000498801118470758,
|
|
"loss": 5.2402,
|
|
"mean_token_accuracy": 0.19899773895740508,
|
|
"num_tokens": 10057196.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 5.409784030914307,
|
|
"epoch": 0.42219020172910665,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000498797581450484,
|
|
"loss": 5.4295,
|
|
"mean_token_accuracy": 0.18354050666093827,
|
|
"num_tokens": 10069655.0,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"entropy": 5.448616600036621,
|
|
"epoch": 0.42267050912584053,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004987940392342948,
|
|
"loss": 5.3095,
|
|
"mean_token_accuracy": 0.19377071112394334,
|
|
"num_tokens": 10080876.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 5.421027898788452,
|
|
"epoch": 0.42315081652257447,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004987904918222726,
|
|
"loss": 5.415,
|
|
"mean_token_accuracy": 0.18513490557670592,
|
|
"num_tokens": 10091986.0,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"entropy": 5.5097509860992435,
|
|
"epoch": 0.42363112391930835,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004987869392144996,
|
|
"loss": 5.499,
|
|
"mean_token_accuracy": 0.18492884635925294,
|
|
"num_tokens": 10104027.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 5.425499534606933,
|
|
"epoch": 0.4241114313160423,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004987833814110584,
|
|
"loss": 5.3567,
|
|
"mean_token_accuracy": 0.1865203857421875,
|
|
"num_tokens": 10114665.0,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"entropy": 5.385516119003296,
|
|
"epoch": 0.42459173871277617,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004987798184120316,
|
|
"loss": 5.3742,
|
|
"mean_token_accuracy": 0.19014959633350373,
|
|
"num_tokens": 10126032.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 5.512171411514283,
|
|
"epoch": 0.4250720461095101,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004987762502175018,
|
|
"loss": 5.4288,
|
|
"mean_token_accuracy": 0.1829407036304474,
|
|
"num_tokens": 10137256.0,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"entropy": 5.3579336643219,
|
|
"epoch": 0.425552353506244,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000498772676827552,
|
|
"loss": 5.3117,
|
|
"mean_token_accuracy": 0.1916539713740349,
|
|
"num_tokens": 10149445.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 5.474416351318359,
|
|
"epoch": 0.4260326609029779,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004987690982422652,
|
|
"loss": 5.4495,
|
|
"mean_token_accuracy": 0.18037094324827194,
|
|
"num_tokens": 10161607.0,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"entropy": 5.448618030548095,
|
|
"epoch": 0.4265129682997118,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004987655144617243,
|
|
"loss": 5.4681,
|
|
"mean_token_accuracy": 0.18403236269950868,
|
|
"num_tokens": 10173184.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 5.4251587867736815,
|
|
"epoch": 0.42699327569644574,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004987619254860126,
|
|
"loss": 5.328,
|
|
"mean_token_accuracy": 0.19698531180620193,
|
|
"num_tokens": 10184617.0,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"entropy": 5.4672339916229244,
|
|
"epoch": 0.4274735830931796,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004987583313152134,
|
|
"loss": 5.3568,
|
|
"mean_token_accuracy": 0.18906597346067427,
|
|
"num_tokens": 10195608.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 5.386989736557007,
|
|
"epoch": 0.42795389048991356,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004987547319494104,
|
|
"loss": 5.4529,
|
|
"mean_token_accuracy": 0.18423379063606263,
|
|
"num_tokens": 10206763.0,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"entropy": 5.486404466629028,
|
|
"epoch": 0.42843419788664744,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004987511273886867,
|
|
"loss": 5.3933,
|
|
"mean_token_accuracy": 0.1908423647284508,
|
|
"num_tokens": 10218714.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 5.427644729614258,
|
|
"epoch": 0.4289145052833814,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004987475176331263,
|
|
"loss": 5.415,
|
|
"mean_token_accuracy": 0.18401106595993041,
|
|
"num_tokens": 10229902.0,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"entropy": 5.423227453231812,
|
|
"epoch": 0.42939481268011526,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004987439026828129,
|
|
"loss": 5.288,
|
|
"mean_token_accuracy": 0.19139131158590317,
|
|
"num_tokens": 10241578.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 5.324700498580933,
|
|
"epoch": 0.4298751200768492,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004987402825378305,
|
|
"loss": 5.2595,
|
|
"mean_token_accuracy": 0.19443607479333877,
|
|
"num_tokens": 10252109.0,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"entropy": 5.429213285446167,
|
|
"epoch": 0.4303554274735831,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004987366571982631,
|
|
"loss": 5.4252,
|
|
"mean_token_accuracy": 0.18883214443922042,
|
|
"num_tokens": 10263357.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 5.487810945510864,
|
|
"epoch": 0.430835734870317,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004987330266641948,
|
|
"loss": 5.4308,
|
|
"mean_token_accuracy": 0.18471152931451798,
|
|
"num_tokens": 10275536.0,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"entropy": 5.453687620162964,
|
|
"epoch": 0.4313160422670509,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004987293909357101,
|
|
"loss": 5.415,
|
|
"mean_token_accuracy": 0.19442622363567352,
|
|
"num_tokens": 10286901.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 5.365311050415039,
|
|
"epoch": 0.43179634966378483,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004987257500128933,
|
|
"loss": 5.3172,
|
|
"mean_token_accuracy": 0.18610639423131942,
|
|
"num_tokens": 10298961.0,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"entropy": 5.462113523483277,
|
|
"epoch": 0.4322766570605187,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004987221038958288,
|
|
"loss": 5.4543,
|
|
"mean_token_accuracy": 0.18748044222593307,
|
|
"num_tokens": 10310911.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 5.510283613204956,
|
|
"epoch": 0.43275696445725265,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004987184525846015,
|
|
"loss": 5.4389,
|
|
"mean_token_accuracy": 0.1841048017144203,
|
|
"num_tokens": 10322267.0,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"entropy": 5.411655378341675,
|
|
"epoch": 0.4332372718539865,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004987147960792958,
|
|
"loss": 5.459,
|
|
"mean_token_accuracy": 0.18804670721292496,
|
|
"num_tokens": 10335111.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 5.520284938812256,
|
|
"epoch": 0.43371757925072046,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004987111343799971,
|
|
"loss": 5.3974,
|
|
"mean_token_accuracy": 0.1907435804605484,
|
|
"num_tokens": 10345672.0,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"entropy": 5.501500225067138,
|
|
"epoch": 0.43419788664745435,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00049870746748679,
|
|
"loss": 5.3725,
|
|
"mean_token_accuracy": 0.1861974611878395,
|
|
"num_tokens": 10357369.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 5.38987283706665,
|
|
"epoch": 0.4346781940441883,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004987037953997598,
|
|
"loss": 5.3935,
|
|
"mean_token_accuracy": 0.18683493435382842,
|
|
"num_tokens": 10368842.0,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"entropy": 5.43892183303833,
|
|
"epoch": 0.43515850144092216,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004987001181189918,
|
|
"loss": 5.3539,
|
|
"mean_token_accuracy": 0.18663013726472855,
|
|
"num_tokens": 10380096.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 5.306481552124024,
|
|
"epoch": 0.4356388088376561,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004986964356445713,
|
|
"loss": 5.3772,
|
|
"mean_token_accuracy": 0.19005681425333024,
|
|
"num_tokens": 10391996.0,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"entropy": 5.48760027885437,
|
|
"epoch": 0.43611911623439004,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004986927479765837,
|
|
"loss": 5.3288,
|
|
"mean_token_accuracy": 0.18343985229730606,
|
|
"num_tokens": 10403607.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 5.396467876434326,
|
|
"epoch": 0.4365994236311239,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004986890551151148,
|
|
"loss": 5.3604,
|
|
"mean_token_accuracy": 0.184589384496212,
|
|
"num_tokens": 10413580.0,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"entropy": 5.349568462371826,
|
|
"epoch": 0.43707973102785785,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004986853570602503,
|
|
"loss": 5.3881,
|
|
"mean_token_accuracy": 0.18719975054264068,
|
|
"num_tokens": 10426456.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 5.520879220962525,
|
|
"epoch": 0.43756003842459174,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004986816538120758,
|
|
"loss": 5.4101,
|
|
"mean_token_accuracy": 0.18188669979572297,
|
|
"num_tokens": 10438869.0,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"entropy": 5.397240781784058,
|
|
"epoch": 0.43804034582132567,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004986779453706778,
|
|
"loss": 5.4142,
|
|
"mean_token_accuracy": 0.1816550999879837,
|
|
"num_tokens": 10450672.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 5.4152685642242435,
|
|
"epoch": 0.43852065321805955,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004986742317361419,
|
|
"loss": 5.3271,
|
|
"mean_token_accuracy": 0.19575155526399612,
|
|
"num_tokens": 10461890.0,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"entropy": 5.498744964599609,
|
|
"epoch": 0.4390009606147935,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004986705129085546,
|
|
"loss": 5.4613,
|
|
"mean_token_accuracy": 0.17549378722906112,
|
|
"num_tokens": 10473866.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 5.460689496994019,
|
|
"epoch": 0.43948126801152737,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004986667888880021,
|
|
"loss": 5.381,
|
|
"mean_token_accuracy": 0.18632390201091767,
|
|
"num_tokens": 10484889.0,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"entropy": 5.412662744522095,
|
|
"epoch": 0.4399615754082613,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004986630596745709,
|
|
"loss": 5.4207,
|
|
"mean_token_accuracy": 0.1880632683634758,
|
|
"num_tokens": 10496108.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 5.389367771148682,
|
|
"epoch": 0.4404418828049952,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004986593252683477,
|
|
"loss": 5.363,
|
|
"mean_token_accuracy": 0.18732869774103164,
|
|
"num_tokens": 10505472.0,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"entropy": 5.307269144058227,
|
|
"epoch": 0.4409221902017291,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004986555856694191,
|
|
"loss": 5.2773,
|
|
"mean_token_accuracy": 0.19333918690681456,
|
|
"num_tokens": 10516954.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 5.524228239059449,
|
|
"epoch": 0.441402497598463,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004986518408778718,
|
|
"loss": 5.3859,
|
|
"mean_token_accuracy": 0.18945636600255966,
|
|
"num_tokens": 10528166.0,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"entropy": 5.38381519317627,
|
|
"epoch": 0.44188280499519694,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004986480908937929,
|
|
"loss": 5.3113,
|
|
"mean_token_accuracy": 0.18772315680980683,
|
|
"num_tokens": 10538112.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 5.444307518005371,
|
|
"epoch": 0.4423631123919308,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004986443357172695,
|
|
"loss": 5.4568,
|
|
"mean_token_accuracy": 0.18497458845376968,
|
|
"num_tokens": 10549888.0,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"entropy": 5.58274884223938,
|
|
"epoch": 0.44284341978866476,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004986405753483887,
|
|
"loss": 5.5294,
|
|
"mean_token_accuracy": 0.17502811402082444,
|
|
"num_tokens": 10561710.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 5.410598850250244,
|
|
"epoch": 0.44332372718539864,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004986368097872377,
|
|
"loss": 5.379,
|
|
"mean_token_accuracy": 0.18401092439889907,
|
|
"num_tokens": 10574564.0,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"entropy": 5.41968560218811,
|
|
"epoch": 0.4438040345821326,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004986330390339042,
|
|
"loss": 5.3586,
|
|
"mean_token_accuracy": 0.18878330439329147,
|
|
"num_tokens": 10586639.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 5.373893547058105,
|
|
"epoch": 0.44428434197886646,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004986292630884755,
|
|
"loss": 5.3645,
|
|
"mean_token_accuracy": 0.18980913162231444,
|
|
"num_tokens": 10598730.0,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"entropy": 5.395772886276245,
|
|
"epoch": 0.4447646493756004,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004986254819510393,
|
|
"loss": 5.2863,
|
|
"mean_token_accuracy": 0.2030077889561653,
|
|
"num_tokens": 10610352.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 5.410120058059692,
|
|
"epoch": 0.4452449567723343,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004986216956216835,
|
|
"loss": 5.3544,
|
|
"mean_token_accuracy": 0.18991922438144684,
|
|
"num_tokens": 10621951.0,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"entropy": 5.380520057678223,
|
|
"epoch": 0.4457252641690682,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000498617904100496,
|
|
"loss": 5.3114,
|
|
"mean_token_accuracy": 0.1913859009742737,
|
|
"num_tokens": 10633207.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 5.473378133773804,
|
|
"epoch": 0.4462055715658021,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004986141073875646,
|
|
"loss": 5.4035,
|
|
"mean_token_accuracy": 0.18385644257068634,
|
|
"num_tokens": 10645853.0,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"entropy": 5.330105209350586,
|
|
"epoch": 0.44668587896253603,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004986103054829779,
|
|
"loss": 5.3305,
|
|
"mean_token_accuracy": 0.18985379487276077,
|
|
"num_tokens": 10656892.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 5.424197340011597,
|
|
"epoch": 0.4471661863592699,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004986064983868237,
|
|
"loss": 5.3095,
|
|
"mean_token_accuracy": 0.18436852544546128,
|
|
"num_tokens": 10670110.0,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"entropy": 5.429648303985596,
|
|
"epoch": 0.44764649375600385,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004986026860991906,
|
|
"loss": 5.4385,
|
|
"mean_token_accuracy": 0.185771344602108,
|
|
"num_tokens": 10681255.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 5.471052789688111,
|
|
"epoch": 0.44812680115273773,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004985988686201672,
|
|
"loss": 5.5041,
|
|
"mean_token_accuracy": 0.1844386264681816,
|
|
"num_tokens": 10692631.0,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"entropy": 5.442734622955323,
|
|
"epoch": 0.44860710854947167,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004985950459498419,
|
|
"loss": 5.3372,
|
|
"mean_token_accuracy": 0.19462240785360335,
|
|
"num_tokens": 10704880.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 5.390188550949096,
|
|
"epoch": 0.44908741594620555,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004985912180883037,
|
|
"loss": 5.3095,
|
|
"mean_token_accuracy": 0.19716786891222,
|
|
"num_tokens": 10715561.0,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"entropy": 5.376702499389649,
|
|
"epoch": 0.4495677233429395,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004985873850356411,
|
|
"loss": 5.3369,
|
|
"mean_token_accuracy": 0.19014816135168075,
|
|
"num_tokens": 10727232.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 5.387975978851318,
|
|
"epoch": 0.45004803073967337,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004985835467919436,
|
|
"loss": 5.3461,
|
|
"mean_token_accuracy": 0.19422013461589813,
|
|
"num_tokens": 10739404.0,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"entropy": 5.369897413253784,
|
|
"epoch": 0.4505283381364073,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004985797033572999,
|
|
"loss": 5.3767,
|
|
"mean_token_accuracy": 0.18446222841739654,
|
|
"num_tokens": 10751948.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 5.362226104736328,
|
|
"epoch": 0.4510086455331412,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004985758547317994,
|
|
"loss": 5.3363,
|
|
"mean_token_accuracy": 0.18433189690113067,
|
|
"num_tokens": 10764611.0,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"entropy": 5.447867727279663,
|
|
"epoch": 0.4514889529298751,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004985720009155315,
|
|
"loss": 5.3727,
|
|
"mean_token_accuracy": 0.1841047078371048,
|
|
"num_tokens": 10775954.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 5.409327983856201,
|
|
"epoch": 0.45196926032660906,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004985681419085856,
|
|
"loss": 5.3909,
|
|
"mean_token_accuracy": 0.18282371312379836,
|
|
"num_tokens": 10788723.0,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"entropy": 5.421317195892334,
|
|
"epoch": 0.45244956772334294,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004985642777110513,
|
|
"loss": 5.3841,
|
|
"mean_token_accuracy": 0.1885462448000908,
|
|
"num_tokens": 10799879.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 5.3301918506622314,
|
|
"epoch": 0.4529298751200769,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004985604083230183,
|
|
"loss": 5.3231,
|
|
"mean_token_accuracy": 0.18998679518699646,
|
|
"num_tokens": 10811838.0,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"entropy": 5.428510332107544,
|
|
"epoch": 0.45341018251681076,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004985565337445765,
|
|
"loss": 5.3434,
|
|
"mean_token_accuracy": 0.19171882420778275,
|
|
"num_tokens": 10822910.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 5.471314573287964,
|
|
"epoch": 0.4538904899135447,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004985526539758158,
|
|
"loss": 5.3992,
|
|
"mean_token_accuracy": 0.18527638167142868,
|
|
"num_tokens": 10835344.0,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"entropy": 5.375976181030273,
|
|
"epoch": 0.4543707973102786,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004985487690168263,
|
|
"loss": 5.4034,
|
|
"mean_token_accuracy": 0.19202104806900025,
|
|
"num_tokens": 10846043.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 5.380132484436035,
|
|
"epoch": 0.4548511047070125,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000498544878867698,
|
|
"loss": 5.298,
|
|
"mean_token_accuracy": 0.19829845130443574,
|
|
"num_tokens": 10857783.0,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"entropy": 5.434480476379394,
|
|
"epoch": 0.4553314121037464,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004985409835285215,
|
|
"loss": 5.373,
|
|
"mean_token_accuracy": 0.19089124351739883,
|
|
"num_tokens": 10870527.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 5.414768075942993,
|
|
"epoch": 0.45581171950048033,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004985370829993873,
|
|
"loss": 5.3646,
|
|
"mean_token_accuracy": 0.19075230062007903,
|
|
"num_tokens": 10882285.0,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"entropy": 5.423041200637817,
|
|
"epoch": 0.4562920268972142,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004985331772803857,
|
|
"loss": 5.3874,
|
|
"mean_token_accuracy": 0.19265468865633012,
|
|
"num_tokens": 10895319.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 5.484057378768921,
|
|
"epoch": 0.45677233429394815,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004985292663716074,
|
|
"loss": 5.382,
|
|
"mean_token_accuracy": 0.19183963984251023,
|
|
"num_tokens": 10906253.0,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"entropy": 5.229197072982788,
|
|
"epoch": 0.457252641690682,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004985253502731435,
|
|
"loss": 5.2575,
|
|
"mean_token_accuracy": 0.19930023998022078,
|
|
"num_tokens": 10918197.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 5.455323648452759,
|
|
"epoch": 0.45773294908741596,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004985214289850845,
|
|
"loss": 5.4579,
|
|
"mean_token_accuracy": 0.17997599244117737,
|
|
"num_tokens": 10930771.0,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"entropy": 5.443937206268311,
|
|
"epoch": 0.45821325648414984,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004985175025075217,
|
|
"loss": 5.3491,
|
|
"mean_token_accuracy": 0.18804308474063874,
|
|
"num_tokens": 10942759.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 5.591840028762817,
|
|
"epoch": 0.4586935638808838,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004985135708405462,
|
|
"loss": 5.5609,
|
|
"mean_token_accuracy": 0.17564835995435715,
|
|
"num_tokens": 10953557.0,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"entropy": 5.411443281173706,
|
|
"epoch": 0.45917387127761766,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004985096339842493,
|
|
"loss": 5.3321,
|
|
"mean_token_accuracy": 0.19676847159862518,
|
|
"num_tokens": 10963142.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 5.309838056564331,
|
|
"epoch": 0.4596541786743516,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004985056919387224,
|
|
"loss": 5.2856,
|
|
"mean_token_accuracy": 0.19894758760929107,
|
|
"num_tokens": 10974321.0,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"entropy": 5.502527189254761,
|
|
"epoch": 0.4601344860710855,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004985017447040569,
|
|
"loss": 5.4874,
|
|
"mean_token_accuracy": 0.18695860356092453,
|
|
"num_tokens": 10985524.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 5.457700490951538,
|
|
"epoch": 0.4606147934678194,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004984977922803447,
|
|
"loss": 5.3727,
|
|
"mean_token_accuracy": 0.1937094435095787,
|
|
"num_tokens": 10997606.0,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"entropy": 5.4323536396026615,
|
|
"epoch": 0.4610951008645533,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004984938346676772,
|
|
"loss": 5.3833,
|
|
"mean_token_accuracy": 0.18257274031639098,
|
|
"num_tokens": 11010692.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 5.40803747177124,
|
|
"epoch": 0.46157540826128723,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004984898718661468,
|
|
"loss": 5.3099,
|
|
"mean_token_accuracy": 0.19199058413505554,
|
|
"num_tokens": 11022517.0,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"entropy": 5.350576591491699,
|
|
"epoch": 0.4620557156580211,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004984859038758451,
|
|
"loss": 5.3253,
|
|
"mean_token_accuracy": 0.19188573807477952,
|
|
"num_tokens": 11033141.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 5.32304048538208,
|
|
"epoch": 0.46253602305475505,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004984819306968642,
|
|
"loss": 5.3173,
|
|
"mean_token_accuracy": 0.19185021072626113,
|
|
"num_tokens": 11044619.0,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"entropy": 5.495067167282104,
|
|
"epoch": 0.46301633045148893,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004984779523292966,
|
|
"loss": 5.3646,
|
|
"mean_token_accuracy": 0.18967657685279846,
|
|
"num_tokens": 11055934.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 5.383758926391602,
|
|
"epoch": 0.46349663784822287,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004984739687732345,
|
|
"loss": 5.2493,
|
|
"mean_token_accuracy": 0.19513811767101288,
|
|
"num_tokens": 11066203.0,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"entropy": 5.187354946136475,
|
|
"epoch": 0.46397694524495675,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004984699800287705,
|
|
"loss": 5.1973,
|
|
"mean_token_accuracy": 0.19977913796901703,
|
|
"num_tokens": 11079664.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 5.341605234146118,
|
|
"epoch": 0.4644572526416907,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000498465986095997,
|
|
"loss": 5.2652,
|
|
"mean_token_accuracy": 0.19821466654539108,
|
|
"num_tokens": 11091186.0,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"entropy": 5.42094578742981,
|
|
"epoch": 0.46493756003842457,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004984619869750069,
|
|
"loss": 5.383,
|
|
"mean_token_accuracy": 0.18526540249586104,
|
|
"num_tokens": 11102710.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 5.292195415496826,
|
|
"epoch": 0.4654178674351585,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000498457982665893,
|
|
"loss": 5.2795,
|
|
"mean_token_accuracy": 0.19302588403224946,
|
|
"num_tokens": 11114746.0,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"entropy": 5.397561931610108,
|
|
"epoch": 0.4658981748318924,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004984539731687483,
|
|
"loss": 5.3462,
|
|
"mean_token_accuracy": 0.18983854949474335,
|
|
"num_tokens": 11126572.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 5.380267095565796,
|
|
"epoch": 0.4663784822286263,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004984499584836659,
|
|
"loss": 5.2431,
|
|
"mean_token_accuracy": 0.19321491122245787,
|
|
"num_tokens": 11137830.0,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"entropy": 5.32379674911499,
|
|
"epoch": 0.4668587896253602,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000498445938610739,
|
|
"loss": 5.281,
|
|
"mean_token_accuracy": 0.19294328689575196,
|
|
"num_tokens": 11148860.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 5.419743824005127,
|
|
"epoch": 0.46733909702209414,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004984419135500608,
|
|
"loss": 5.4081,
|
|
"mean_token_accuracy": 0.17859717160463334,
|
|
"num_tokens": 11161311.0,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"entropy": 5.430191612243652,
|
|
"epoch": 0.4678194044188281,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004984378833017249,
|
|
"loss": 5.2942,
|
|
"mean_token_accuracy": 0.19046030193567276,
|
|
"num_tokens": 11173124.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 5.344765472412109,
|
|
"epoch": 0.46829971181556196,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004984338478658248,
|
|
"loss": 5.3783,
|
|
"mean_token_accuracy": 0.19164984971284865,
|
|
"num_tokens": 11184879.0,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"entropy": 5.45609302520752,
|
|
"epoch": 0.4687800192122959,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004984298072424542,
|
|
"loss": 5.378,
|
|
"mean_token_accuracy": 0.1874854624271393,
|
|
"num_tokens": 11196243.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 5.339529609680175,
|
|
"epoch": 0.4692603266090298,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000498425761431707,
|
|
"loss": 5.2513,
|
|
"mean_token_accuracy": 0.20040780752897264,
|
|
"num_tokens": 11207485.0,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"entropy": 5.312271356582642,
|
|
"epoch": 0.4697406340057637,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000498421710433677,
|
|
"loss": 5.279,
|
|
"mean_token_accuracy": 0.19036460667848587,
|
|
"num_tokens": 11219891.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 5.4914182186126705,
|
|
"epoch": 0.4702209414024976,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004984176542484584,
|
|
"loss": 5.388,
|
|
"mean_token_accuracy": 0.18597144782543182,
|
|
"num_tokens": 11231329.0,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"entropy": 5.378525733947754,
|
|
"epoch": 0.47070124879923153,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004984135928761452,
|
|
"loss": 5.266,
|
|
"mean_token_accuracy": 0.1995886370539665,
|
|
"num_tokens": 11241367.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 5.358568334579468,
|
|
"epoch": 0.4711815561959654,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004984095263168317,
|
|
"loss": 5.3589,
|
|
"mean_token_accuracy": 0.18466073721647264,
|
|
"num_tokens": 11254532.0,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"entropy": 5.4979103088378904,
|
|
"epoch": 0.47166186359269935,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004984054545706124,
|
|
"loss": 5.4398,
|
|
"mean_token_accuracy": 0.18243181705474854,
|
|
"num_tokens": 11265223.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 5.3696846008300785,
|
|
"epoch": 0.47214217098943323,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000498401377637582,
|
|
"loss": 5.3635,
|
|
"mean_token_accuracy": 0.18885526210069656,
|
|
"num_tokens": 11278228.0,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"entropy": 5.484466791152954,
|
|
"epoch": 0.47262247838616717,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000498397295517835,
|
|
"loss": 5.4846,
|
|
"mean_token_accuracy": 0.1801117405295372,
|
|
"num_tokens": 11289654.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 5.394139242172241,
|
|
"epoch": 0.47310278578290105,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004983932082114659,
|
|
"loss": 5.2357,
|
|
"mean_token_accuracy": 0.19755308330059052,
|
|
"num_tokens": 11301911.0,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"entropy": 5.4873377799987795,
|
|
"epoch": 0.473583093179635,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004983891157185699,
|
|
"loss": 5.4364,
|
|
"mean_token_accuracy": 0.18308536261320113,
|
|
"num_tokens": 11312945.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 5.549541664123535,
|
|
"epoch": 0.47406340057636887,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004983850180392421,
|
|
"loss": 5.4774,
|
|
"mean_token_accuracy": 0.18022425770759581,
|
|
"num_tokens": 11324126.0,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"entropy": 5.402717351913452,
|
|
"epoch": 0.4745437079731028,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004983809151735775,
|
|
"loss": 5.4133,
|
|
"mean_token_accuracy": 0.18017226606607437,
|
|
"num_tokens": 11336395.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 5.403596019744873,
|
|
"epoch": 0.4750240153698367,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004983768071216713,
|
|
"loss": 5.3135,
|
|
"mean_token_accuracy": 0.1902969852089882,
|
|
"num_tokens": 11347387.0,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"entropy": 5.353836917877198,
|
|
"epoch": 0.4755043227665706,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004983726938836189,
|
|
"loss": 5.308,
|
|
"mean_token_accuracy": 0.19681546241044998,
|
|
"num_tokens": 11358467.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 5.486645841598511,
|
|
"epoch": 0.4759846301633045,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004983685754595159,
|
|
"loss": 5.4724,
|
|
"mean_token_accuracy": 0.18010423183441163,
|
|
"num_tokens": 11370322.0,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"entropy": 5.333859491348266,
|
|
"epoch": 0.47646493756003844,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004983644518494578,
|
|
"loss": 5.2697,
|
|
"mean_token_accuracy": 0.20096147507429124,
|
|
"num_tokens": 11381719.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 5.328320550918579,
|
|
"epoch": 0.4769452449567723,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004983603230535403,
|
|
"loss": 5.2895,
|
|
"mean_token_accuracy": 0.1948627695441246,
|
|
"num_tokens": 11393561.0,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"entropy": 5.460376167297364,
|
|
"epoch": 0.47742555235350626,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004983561890718594,
|
|
"loss": 5.3849,
|
|
"mean_token_accuracy": 0.18933912962675095,
|
|
"num_tokens": 11405411.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 5.5110303401947025,
|
|
"epoch": 0.47790585975024014,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000498352049904511,
|
|
"loss": 5.4771,
|
|
"mean_token_accuracy": 0.17981591820716858,
|
|
"num_tokens": 11417419.0,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"entropy": 5.429950714111328,
|
|
"epoch": 0.4783861671469741,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004983479055515914,
|
|
"loss": 5.2844,
|
|
"mean_token_accuracy": 0.18997065275907515,
|
|
"num_tokens": 11428145.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 5.290281534194946,
|
|
"epoch": 0.47886647454370795,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004983437560131964,
|
|
"loss": 5.2422,
|
|
"mean_token_accuracy": 0.1993091583251953,
|
|
"num_tokens": 11439224.0,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"entropy": 5.409195756912231,
|
|
"epoch": 0.4793467819404419,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004983396012894228,
|
|
"loss": 5.3477,
|
|
"mean_token_accuracy": 0.18979695290327073,
|
|
"num_tokens": 11451731.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 5.435146522521973,
|
|
"epoch": 0.47982708933717577,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004983354413803666,
|
|
"loss": 5.3375,
|
|
"mean_token_accuracy": 0.1958609476685524,
|
|
"num_tokens": 11463058.0,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"entropy": 5.473912382125855,
|
|
"epoch": 0.4803073967339097,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004983312762861248,
|
|
"loss": 5.4305,
|
|
"mean_token_accuracy": 0.18449530750513077,
|
|
"num_tokens": 11472618.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 5.364778709411621,
|
|
"epoch": 0.4807877041306436,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004983271060067939,
|
|
"loss": 5.3246,
|
|
"mean_token_accuracy": 0.18677808940410615,
|
|
"num_tokens": 11483114.0,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"entropy": 5.3417730808258055,
|
|
"epoch": 0.4812680115273775,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004983229305424707,
|
|
"loss": 5.2799,
|
|
"mean_token_accuracy": 0.19405496269464492,
|
|
"num_tokens": 11494281.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 5.351672601699829,
|
|
"epoch": 0.4817483189241114,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004983187498932522,
|
|
"loss": 5.3503,
|
|
"mean_token_accuracy": 0.18800514042377472,
|
|
"num_tokens": 11505962.0,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"entropy": 5.4874766826629635,
|
|
"epoch": 0.48222862632084534,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004983145640592354,
|
|
"loss": 5.4492,
|
|
"mean_token_accuracy": 0.18352760821580888,
|
|
"num_tokens": 11517558.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 5.448751974105835,
|
|
"epoch": 0.4827089337175792,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004983103730405176,
|
|
"loss": 5.4179,
|
|
"mean_token_accuracy": 0.18682138621807098,
|
|
"num_tokens": 11529184.0,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"entropy": 5.338459253311157,
|
|
"epoch": 0.48318924111431316,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000498306176837196,
|
|
"loss": 5.3335,
|
|
"mean_token_accuracy": 0.18406548202037812,
|
|
"num_tokens": 11540727.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 5.360374689102173,
|
|
"epoch": 0.48366954851104704,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004983019754493681,
|
|
"loss": 5.261,
|
|
"mean_token_accuracy": 0.1907915487885475,
|
|
"num_tokens": 11551510.0,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"entropy": 5.47594895362854,
|
|
"epoch": 0.484149855907781,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004982977688771314,
|
|
"loss": 5.4187,
|
|
"mean_token_accuracy": 0.18854755759239197,
|
|
"num_tokens": 11563203.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 5.308377647399903,
|
|
"epoch": 0.4846301633045149,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004982935571205835,
|
|
"loss": 5.2718,
|
|
"mean_token_accuracy": 0.19544857442379,
|
|
"num_tokens": 11576013.0,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"entropy": 5.291185140609741,
|
|
"epoch": 0.4851104707012488,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004982893401798223,
|
|
"loss": 5.2498,
|
|
"mean_token_accuracy": 0.20830876976251603,
|
|
"num_tokens": 11587535.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 5.403550291061402,
|
|
"epoch": 0.48559077809798273,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004982851180549456,
|
|
"loss": 5.2771,
|
|
"mean_token_accuracy": 0.19294197112321854,
|
|
"num_tokens": 11598487.0,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"entropy": 5.25755033493042,
|
|
"epoch": 0.4860710854947166,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004982808907460515,
|
|
"loss": 5.1559,
|
|
"mean_token_accuracy": 0.20932556241750716,
|
|
"num_tokens": 11609457.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 5.265308237075805,
|
|
"epoch": 0.48655139289145055,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004982766582532382,
|
|
"loss": 5.2257,
|
|
"mean_token_accuracy": 0.19795275181531907,
|
|
"num_tokens": 11620251.0,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"entropy": 5.307956266403198,
|
|
"epoch": 0.48703170028818443,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004982724205766038,
|
|
"loss": 5.2262,
|
|
"mean_token_accuracy": 0.19880327582359314,
|
|
"num_tokens": 11630956.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 5.348564767837525,
|
|
"epoch": 0.48751200768491837,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004982681777162468,
|
|
"loss": 5.2773,
|
|
"mean_token_accuracy": 0.1949208691716194,
|
|
"num_tokens": 11642560.0,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"entropy": 5.300316572189331,
|
|
"epoch": 0.48799231508165225,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004982639296722657,
|
|
"loss": 5.2365,
|
|
"mean_token_accuracy": 0.19546635299921036,
|
|
"num_tokens": 11654050.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 5.333183813095093,
|
|
"epoch": 0.4884726224783862,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004982596764447591,
|
|
"loss": 5.4035,
|
|
"mean_token_accuracy": 0.19310665130615234,
|
|
"num_tokens": 11664947.0,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"entropy": 5.469000768661499,
|
|
"epoch": 0.48895292987512007,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004982554180338258,
|
|
"loss": 5.3106,
|
|
"mean_token_accuracy": 0.19500951319932938,
|
|
"num_tokens": 11676927.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 5.502379417419434,
|
|
"epoch": 0.489433237271854,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004982511544395646,
|
|
"loss": 5.4242,
|
|
"mean_token_accuracy": 0.18115128874778746,
|
|
"num_tokens": 11688573.0,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"entropy": 5.288805294036865,
|
|
"epoch": 0.4899135446685879,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004982468856620745,
|
|
"loss": 5.3128,
|
|
"mean_token_accuracy": 0.18783441036939622,
|
|
"num_tokens": 11698704.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 5.3273578643798825,
|
|
"epoch": 0.4903938520653218,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004982426117014545,
|
|
"loss": 5.2533,
|
|
"mean_token_accuracy": 0.19392533451318741,
|
|
"num_tokens": 11709466.0,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"entropy": 5.3791663646698,
|
|
"epoch": 0.4908741594620557,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004982383325578041,
|
|
"loss": 5.3413,
|
|
"mean_token_accuracy": 0.1898537114262581,
|
|
"num_tokens": 11721120.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 5.4256843566894535,
|
|
"epoch": 0.49135446685878964,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004982340482312226,
|
|
"loss": 5.3358,
|
|
"mean_token_accuracy": 0.18456312417984008,
|
|
"num_tokens": 11732120.0,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"entropy": 5.288364553451538,
|
|
"epoch": 0.4918347742555235,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004982297587218092,
|
|
"loss": 5.2294,
|
|
"mean_token_accuracy": 0.1978309139609337,
|
|
"num_tokens": 11743501.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 5.363348197937012,
|
|
"epoch": 0.49231508165225746,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004982254640296637,
|
|
"loss": 5.3152,
|
|
"mean_token_accuracy": 0.1956743210554123,
|
|
"num_tokens": 11755051.0,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"entropy": 5.436681079864502,
|
|
"epoch": 0.49279538904899134,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004982211641548857,
|
|
"loss": 5.4609,
|
|
"mean_token_accuracy": 0.1842927649617195,
|
|
"num_tokens": 11767663.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 5.419048309326172,
|
|
"epoch": 0.4932756964457253,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004982168590975752,
|
|
"loss": 5.3034,
|
|
"mean_token_accuracy": 0.19774986803531647,
|
|
"num_tokens": 11778828.0,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"entropy": 5.459513902664185,
|
|
"epoch": 0.49375600384245916,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004982125488578321,
|
|
"loss": 5.4794,
|
|
"mean_token_accuracy": 0.18496931344270706,
|
|
"num_tokens": 11790654.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 5.433895540237427,
|
|
"epoch": 0.4942363112391931,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004982082334357563,
|
|
"loss": 5.2837,
|
|
"mean_token_accuracy": 0.1902835488319397,
|
|
"num_tokens": 11801489.0,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"entropy": 5.311564207077026,
|
|
"epoch": 0.494716618635927,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004982039128314481,
|
|
"loss": 5.2873,
|
|
"mean_token_accuracy": 0.19224448949098588,
|
|
"num_tokens": 11813818.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 5.333755207061768,
|
|
"epoch": 0.4951969260326609,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004981995870450079,
|
|
"loss": 5.2929,
|
|
"mean_token_accuracy": 0.191859370470047,
|
|
"num_tokens": 11824814.0,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"entropy": 5.45896692276001,
|
|
"epoch": 0.4956772334293948,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004981952560765361,
|
|
"loss": 5.3373,
|
|
"mean_token_accuracy": 0.18679553270339966,
|
|
"num_tokens": 11836252.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 5.314207363128662,
|
|
"epoch": 0.49615754082612873,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004981909199261331,
|
|
"loss": 5.2629,
|
|
"mean_token_accuracy": 0.19086166322231293,
|
|
"num_tokens": 11847715.0,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"entropy": 5.273135042190551,
|
|
"epoch": 0.4966378482228626,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004981865785938998,
|
|
"loss": 5.2629,
|
|
"mean_token_accuracy": 0.19300127327442168,
|
|
"num_tokens": 11860309.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 5.348716497421265,
|
|
"epoch": 0.49711815561959655,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004981822320799367,
|
|
"loss": 5.2577,
|
|
"mean_token_accuracy": 0.1956932559609413,
|
|
"num_tokens": 11872569.0,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"entropy": 5.3287012577056885,
|
|
"epoch": 0.49759846301633043,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004981778803843449,
|
|
"loss": 5.2523,
|
|
"mean_token_accuracy": 0.19481286704540252,
|
|
"num_tokens": 11884778.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 5.390296173095703,
|
|
"epoch": 0.49807877041306436,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004981735235072256,
|
|
"loss": 5.3358,
|
|
"mean_token_accuracy": 0.1911753833293915,
|
|
"num_tokens": 11897324.0,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"entropy": 5.467144203186035,
|
|
"epoch": 0.49855907780979825,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004981691614486796,
|
|
"loss": 5.366,
|
|
"mean_token_accuracy": 0.18982964605093003,
|
|
"num_tokens": 11909145.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 5.322554683685302,
|
|
"epoch": 0.4990393852065322,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004981647942088084,
|
|
"loss": 5.2697,
|
|
"mean_token_accuracy": 0.20009808093309403,
|
|
"num_tokens": 11921021.0,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"entropy": 5.487699699401856,
|
|
"epoch": 0.49951969260326606,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004981604217877135,
|
|
"loss": 5.4279,
|
|
"mean_token_accuracy": 0.1888749822974205,
|
|
"num_tokens": 11932565.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 5.318529844284058,
|
|
"epoch": 0.5,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000498156044185496,
|
|
"loss": 5.3392,
|
|
"mean_token_accuracy": 0.19370948225259782,
|
|
"num_tokens": 11943225.0,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"entropy": 5.364103078842163,
|
|
"epoch": 0.5004803073967339,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004981516614022579,
|
|
"loss": 5.3219,
|
|
"mean_token_accuracy": 0.1932568922638893,
|
|
"num_tokens": 11954821.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 5.446450281143188,
|
|
"epoch": 0.5009606147934679,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004981472734381008,
|
|
"loss": 5.2738,
|
|
"mean_token_accuracy": 0.1951069414615631,
|
|
"num_tokens": 11966090.0,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"entropy": 5.353061962127685,
|
|
"epoch": 0.5014409221902018,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004981428802931267,
|
|
"loss": 5.3074,
|
|
"mean_token_accuracy": 0.1921882688999176,
|
|
"num_tokens": 11977410.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 5.339950656890869,
|
|
"epoch": 0.5019212295869356,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004981384819674375,
|
|
"loss": 5.2841,
|
|
"mean_token_accuracy": 0.19126271605491638,
|
|
"num_tokens": 11989119.0,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"entropy": 5.432912015914917,
|
|
"epoch": 0.5024015369836695,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004981340784611354,
|
|
"loss": 5.3942,
|
|
"mean_token_accuracy": 0.19018032401800156,
|
|
"num_tokens": 12000165.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 5.395741987228393,
|
|
"epoch": 0.5028818443804035,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004981296697743224,
|
|
"loss": 5.3475,
|
|
"mean_token_accuracy": 0.18768104463815688,
|
|
"num_tokens": 12012118.0,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"entropy": 5.430673694610595,
|
|
"epoch": 0.5033621517771374,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004981252559071012,
|
|
"loss": 5.4181,
|
|
"mean_token_accuracy": 0.1866712138056755,
|
|
"num_tokens": 12023432.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 5.427559089660645,
|
|
"epoch": 0.5038424591738713,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004981208368595739,
|
|
"loss": 5.2939,
|
|
"mean_token_accuracy": 0.1980261042714119,
|
|
"num_tokens": 12034323.0,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"entropy": 5.264776802062988,
|
|
"epoch": 0.5043227665706052,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004981164126318435,
|
|
"loss": 5.3022,
|
|
"mean_token_accuracy": 0.19116167575120926,
|
|
"num_tokens": 12045532.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 5.449652862548828,
|
|
"epoch": 0.5048030739673391,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004981119832240124,
|
|
"loss": 5.3111,
|
|
"mean_token_accuracy": 0.19520313441753387,
|
|
"num_tokens": 12057346.0,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"entropy": 5.301677227020264,
|
|
"epoch": 0.505283381364073,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004981075486361837,
|
|
"loss": 5.2825,
|
|
"mean_token_accuracy": 0.19872631430625914,
|
|
"num_tokens": 12068670.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 5.390146923065186,
|
|
"epoch": 0.5057636887608069,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004981031088684601,
|
|
"loss": 5.4028,
|
|
"mean_token_accuracy": 0.18470921665430068,
|
|
"num_tokens": 12079664.0,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"entropy": 5.474726438522339,
|
|
"epoch": 0.5062439961575408,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004980986639209448,
|
|
"loss": 5.3285,
|
|
"mean_token_accuracy": 0.1994831383228302,
|
|
"num_tokens": 12089984.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 5.29730339050293,
|
|
"epoch": 0.5067243035542748,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000498094213793741,
|
|
"loss": 5.2835,
|
|
"mean_token_accuracy": 0.1948940023779869,
|
|
"num_tokens": 12101182.0,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"entropy": 5.408280658721924,
|
|
"epoch": 0.5072046109510087,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000498089758486952,
|
|
"loss": 5.353,
|
|
"mean_token_accuracy": 0.18289182782173158,
|
|
"num_tokens": 12112002.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 5.495666790008545,
|
|
"epoch": 0.5076849183477425,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004980852980006812,
|
|
"loss": 5.4392,
|
|
"mean_token_accuracy": 0.1805154114961624,
|
|
"num_tokens": 12124194.0,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"entropy": 5.392632579803466,
|
|
"epoch": 0.5081652257444764,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004980808323350323,
|
|
"loss": 5.359,
|
|
"mean_token_accuracy": 0.1960368499159813,
|
|
"num_tokens": 12133966.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 5.391989612579346,
|
|
"epoch": 0.5086455331412104,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004980763614901089,
|
|
"loss": 5.2967,
|
|
"mean_token_accuracy": 0.19686038345098494,
|
|
"num_tokens": 12145643.0,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"entropy": 5.379247760772705,
|
|
"epoch": 0.5091258405379443,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004980718854660146,
|
|
"loss": 5.3464,
|
|
"mean_token_accuracy": 0.18789971768856048,
|
|
"num_tokens": 12156804.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 5.400803756713867,
|
|
"epoch": 0.5096061479346782,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004980674042628537,
|
|
"loss": 5.2967,
|
|
"mean_token_accuracy": 0.19052283465862274,
|
|
"num_tokens": 12168700.0,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"entropy": 5.401619243621826,
|
|
"epoch": 0.5100864553314121,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00049806291788073,
|
|
"loss": 5.3123,
|
|
"mean_token_accuracy": 0.18629832863807677,
|
|
"num_tokens": 12181050.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 5.469602966308594,
|
|
"epoch": 0.510566762728146,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004980584263197477,
|
|
"loss": 5.3949,
|
|
"mean_token_accuracy": 0.1858072027564049,
|
|
"num_tokens": 12192001.0,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"entropy": 5.508568143844604,
|
|
"epoch": 0.5110470701248799,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004980539295800111,
|
|
"loss": 5.509,
|
|
"mean_token_accuracy": 0.18043418526649474,
|
|
"num_tokens": 12202436.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 5.362590551376343,
|
|
"epoch": 0.5115273775216138,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004980494276616246,
|
|
"loss": 5.3016,
|
|
"mean_token_accuracy": 0.18966611623764038,
|
|
"num_tokens": 12214454.0,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"entropy": 5.349428033828735,
|
|
"epoch": 0.5120076849183477,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004980449205646926,
|
|
"loss": 5.3122,
|
|
"mean_token_accuracy": 0.19553214311599731,
|
|
"num_tokens": 12225924.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 5.415020084381103,
|
|
"epoch": 0.5124879923150817,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00049804040828932,
|
|
"loss": 5.3326,
|
|
"mean_token_accuracy": 0.19512139409780502,
|
|
"num_tokens": 12236456.0,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"entropy": 5.421989011764526,
|
|
"epoch": 0.5129682997118156,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004980358908356113,
|
|
"loss": 5.3535,
|
|
"mean_token_accuracy": 0.18762658089399337,
|
|
"num_tokens": 12247719.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 5.350346803665161,
|
|
"epoch": 0.5134486071085494,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004980313682036717,
|
|
"loss": 5.381,
|
|
"mean_token_accuracy": 0.1927213490009308,
|
|
"num_tokens": 12259141.0,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"entropy": 5.49134635925293,
|
|
"epoch": 0.5139289145052833,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004980268403936058,
|
|
"loss": 5.4456,
|
|
"mean_token_accuracy": 0.18453603684902192,
|
|
"num_tokens": 12269748.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 5.434391784667969,
|
|
"epoch": 0.5144092219020173,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004980223074055189,
|
|
"loss": 5.379,
|
|
"mean_token_accuracy": 0.1960138276219368,
|
|
"num_tokens": 12281456.0,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"entropy": 5.409012746810913,
|
|
"epoch": 0.5148895292987512,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004980177692395164,
|
|
"loss": 5.3518,
|
|
"mean_token_accuracy": 0.18338604271411896,
|
|
"num_tokens": 12293763.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 5.351993417739868,
|
|
"epoch": 0.5153698366954851,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004980132258957035,
|
|
"loss": 5.2808,
|
|
"mean_token_accuracy": 0.1969463735818863,
|
|
"num_tokens": 12305398.0,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"entropy": 5.274507617950439,
|
|
"epoch": 0.515850144092219,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004980086773741856,
|
|
"loss": 5.2796,
|
|
"mean_token_accuracy": 0.19121709913015367,
|
|
"num_tokens": 12316582.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 5.483122396469116,
|
|
"epoch": 0.516330451488953,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004980041236750685,
|
|
"loss": 5.3846,
|
|
"mean_token_accuracy": 0.18809578120708464,
|
|
"num_tokens": 12328463.0,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"entropy": 5.445298194885254,
|
|
"epoch": 0.5168107588856868,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004979995647984577,
|
|
"loss": 5.3698,
|
|
"mean_token_accuracy": 0.19524169117212295,
|
|
"num_tokens": 12341040.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 5.2983297348022464,
|
|
"epoch": 0.5172910662824207,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004979950007444593,
|
|
"loss": 5.261,
|
|
"mean_token_accuracy": 0.1934810236096382,
|
|
"num_tokens": 12353024.0,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"entropy": 5.358570623397827,
|
|
"epoch": 0.5177713736791547,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004979904315131792,
|
|
"loss": 5.2844,
|
|
"mean_token_accuracy": 0.19403222799301148,
|
|
"num_tokens": 12366100.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 5.293501186370849,
|
|
"epoch": 0.5182516810758886,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004979858571047233,
|
|
"loss": 5.2707,
|
|
"mean_token_accuracy": 0.19768950045108796,
|
|
"num_tokens": 12377829.0,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"entropy": 5.466844320297241,
|
|
"epoch": 0.5187319884726225,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004979812775191979,
|
|
"loss": 5.4031,
|
|
"mean_token_accuracy": 0.18979473859071733,
|
|
"num_tokens": 12390830.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 5.328051805496216,
|
|
"epoch": 0.5192122958693564,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004979766927567094,
|
|
"loss": 5.2545,
|
|
"mean_token_accuracy": 0.19470396041870117,
|
|
"num_tokens": 12401642.0,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"entropy": 5.3456236839294435,
|
|
"epoch": 0.5196926032660903,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004979721028173643,
|
|
"loss": 5.3476,
|
|
"mean_token_accuracy": 0.1877232700586319,
|
|
"num_tokens": 12411653.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 5.386164760589599,
|
|
"epoch": 0.5201729106628242,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000497967507701269,
|
|
"loss": 5.2486,
|
|
"mean_token_accuracy": 0.20038487911224365,
|
|
"num_tokens": 12422891.0,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"entropy": 5.397801113128662,
|
|
"epoch": 0.5206532180595581,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004979629074085303,
|
|
"loss": 5.3408,
|
|
"mean_token_accuracy": 0.19329493790864943,
|
|
"num_tokens": 12434190.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 5.424389457702636,
|
|
"epoch": 0.521133525456292,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004979583019392548,
|
|
"loss": 5.3974,
|
|
"mean_token_accuracy": 0.18989453911781312,
|
|
"num_tokens": 12445796.0,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"entropy": 5.483598613739014,
|
|
"epoch": 0.521613832853026,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004979536912935497,
|
|
"loss": 5.4639,
|
|
"mean_token_accuracy": 0.18501935750246049,
|
|
"num_tokens": 12456212.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 5.330318355560303,
|
|
"epoch": 0.5220941402497599,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000497949075471522,
|
|
"loss": 5.1899,
|
|
"mean_token_accuracy": 0.19820088148117065,
|
|
"num_tokens": 12467871.0,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"entropy": 5.372925519943237,
|
|
"epoch": 0.5225744476464937,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004979444544732786,
|
|
"loss": 5.2819,
|
|
"mean_token_accuracy": 0.1852207139134407,
|
|
"num_tokens": 12478626.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 5.313206958770752,
|
|
"epoch": 0.5230547550432276,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000497939828298927,
|
|
"loss": 5.3741,
|
|
"mean_token_accuracy": 0.19033849388360977,
|
|
"num_tokens": 12491487.0,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"entropy": 5.462804317474365,
|
|
"epoch": 0.5235350624399616,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004979351969485747,
|
|
"loss": 5.3383,
|
|
"mean_token_accuracy": 0.18805173933506011,
|
|
"num_tokens": 12503240.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 5.4243183612823485,
|
|
"epoch": 0.5240153698366955,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004979305604223291,
|
|
"loss": 5.2774,
|
|
"mean_token_accuracy": 0.1903422147035599,
|
|
"num_tokens": 12513860.0,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"entropy": 5.313809871673584,
|
|
"epoch": 0.5244956772334294,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004979259187202978,
|
|
"loss": 5.352,
|
|
"mean_token_accuracy": 0.1945337176322937,
|
|
"num_tokens": 12525884.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 5.442373895645142,
|
|
"epoch": 0.5249759846301633,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004979212718425887,
|
|
"loss": 5.2672,
|
|
"mean_token_accuracy": 0.1932208612561226,
|
|
"num_tokens": 12536709.0,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"entropy": 5.334468412399292,
|
|
"epoch": 0.5254562920268973,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004979166197893096,
|
|
"loss": 5.2663,
|
|
"mean_token_accuracy": 0.19677013605833055,
|
|
"num_tokens": 12549727.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 5.339883422851562,
|
|
"epoch": 0.5259365994236311,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004979119625605683,
|
|
"loss": 5.3345,
|
|
"mean_token_accuracy": 0.18942939788103103,
|
|
"num_tokens": 12562053.0,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"entropy": 5.287409067153931,
|
|
"epoch": 0.526416906820365,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004979073001564734,
|
|
"loss": 5.2257,
|
|
"mean_token_accuracy": 0.20170782059431075,
|
|
"num_tokens": 12574096.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 5.40628571510315,
|
|
"epoch": 0.5268972142170989,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004979026325771328,
|
|
"loss": 5.4013,
|
|
"mean_token_accuracy": 0.18865474164485932,
|
|
"num_tokens": 12585416.0,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"entropy": 5.369120025634766,
|
|
"epoch": 0.5273775216138329,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004978979598226549,
|
|
"loss": 5.2525,
|
|
"mean_token_accuracy": 0.1964880034327507,
|
|
"num_tokens": 12596861.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 5.307511520385742,
|
|
"epoch": 0.5278578290105668,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004978932818931483,
|
|
"loss": 5.2672,
|
|
"mean_token_accuracy": 0.19722044318914414,
|
|
"num_tokens": 12607761.0,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"entropy": 5.4275431632995605,
|
|
"epoch": 0.5283381364073007,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004978885987887216,
|
|
"loss": 5.3898,
|
|
"mean_token_accuracy": 0.19588741660118103,
|
|
"num_tokens": 12619889.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 5.4371997833251955,
|
|
"epoch": 0.5288184438040345,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004978839105094833,
|
|
"loss": 5.3606,
|
|
"mean_token_accuracy": 0.19224700778722764,
|
|
"num_tokens": 12630604.0,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"entropy": 5.222589921951294,
|
|
"epoch": 0.5292987512007685,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004978792170555426,
|
|
"loss": 5.2618,
|
|
"mean_token_accuracy": 0.19633477181196213,
|
|
"num_tokens": 12641172.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 5.292724561691284,
|
|
"epoch": 0.5297790585975024,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004978745184270083,
|
|
"loss": 5.1601,
|
|
"mean_token_accuracy": 0.20660953521728515,
|
|
"num_tokens": 12651731.0,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"entropy": 5.392834901809692,
|
|
"epoch": 0.5302593659942363,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004978698146239893,
|
|
"loss": 5.2978,
|
|
"mean_token_accuracy": 0.1936490774154663,
|
|
"num_tokens": 12663050.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 5.409347009658814,
|
|
"epoch": 0.5307396733909702,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004978651056465952,
|
|
"loss": 5.3862,
|
|
"mean_token_accuracy": 0.18999682515859603,
|
|
"num_tokens": 12674732.0,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"entropy": 5.332290983200073,
|
|
"epoch": 0.5312199807877042,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000497860391494935,
|
|
"loss": 5.2171,
|
|
"mean_token_accuracy": 0.19382983297109604,
|
|
"num_tokens": 12685981.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 5.412051010131836,
|
|
"epoch": 0.531700288184438,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004978556721691183,
|
|
"loss": 5.3525,
|
|
"mean_token_accuracy": 0.19065555483102797,
|
|
"num_tokens": 12697139.0,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"entropy": 5.317591810226441,
|
|
"epoch": 0.5321805955811719,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004978509476692547,
|
|
"loss": 5.2966,
|
|
"mean_token_accuracy": 0.18611351698637008,
|
|
"num_tokens": 12708268.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 5.375318956375122,
|
|
"epoch": 0.5326609029779059,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004978462179954538,
|
|
"loss": 5.2958,
|
|
"mean_token_accuracy": 0.18993753045797349,
|
|
"num_tokens": 12720715.0,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"entropy": 5.3367125511169435,
|
|
"epoch": 0.5331412103746398,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004978414831478253,
|
|
"loss": 5.269,
|
|
"mean_token_accuracy": 0.19713337272405623,
|
|
"num_tokens": 12732409.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 5.323969554901123,
|
|
"epoch": 0.5336215177713737,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004978367431264794,
|
|
"loss": 5.397,
|
|
"mean_token_accuracy": 0.18209069669246675,
|
|
"num_tokens": 12745174.0,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"entropy": 5.410878992080688,
|
|
"epoch": 0.5341018251681076,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004978319979315261,
|
|
"loss": 5.3328,
|
|
"mean_token_accuracy": 0.19573558866977692,
|
|
"num_tokens": 12756116.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 5.376229763031006,
|
|
"epoch": 0.5345821325648416,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004978272475630752,
|
|
"loss": 5.2851,
|
|
"mean_token_accuracy": 0.1916971653699875,
|
|
"num_tokens": 12768183.0,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"entropy": 5.264455699920655,
|
|
"epoch": 0.5350624399615754,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004978224920212374,
|
|
"loss": 5.2931,
|
|
"mean_token_accuracy": 0.1934914067387581,
|
|
"num_tokens": 12778537.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 5.313297891616822,
|
|
"epoch": 0.5355427473583093,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004978177313061232,
|
|
"loss": 5.3228,
|
|
"mean_token_accuracy": 0.19088124930858613,
|
|
"num_tokens": 12789691.0,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"entropy": 5.473337554931641,
|
|
"epoch": 0.5360230547550432,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004978129654178426,
|
|
"loss": 5.3433,
|
|
"mean_token_accuracy": 0.18791570216417314,
|
|
"num_tokens": 12801438.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 5.4069455623626705,
|
|
"epoch": 0.5365033621517772,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004978081943565067,
|
|
"loss": 5.3061,
|
|
"mean_token_accuracy": 0.18656288981437683,
|
|
"num_tokens": 12812425.0,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"entropy": 5.307536172866821,
|
|
"epoch": 0.5369836695485111,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004978034181222261,
|
|
"loss": 5.2769,
|
|
"mean_token_accuracy": 0.18625542372465134,
|
|
"num_tokens": 12824735.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 5.430880117416382,
|
|
"epoch": 0.537463976945245,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004977986367151119,
|
|
"loss": 5.3688,
|
|
"mean_token_accuracy": 0.1952778786420822,
|
|
"num_tokens": 12835454.0,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"entropy": 5.434065580368042,
|
|
"epoch": 0.5379442843419788,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004977938501352747,
|
|
"loss": 5.4122,
|
|
"mean_token_accuracy": 0.18514797538518907,
|
|
"num_tokens": 12847086.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 5.385431623458862,
|
|
"epoch": 0.5384245917387128,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004977890583828259,
|
|
"loss": 5.3549,
|
|
"mean_token_accuracy": 0.1888865575194359,
|
|
"num_tokens": 12857713.0,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"entropy": 5.36136646270752,
|
|
"epoch": 0.5389048991354467,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004977842614578768,
|
|
"loss": 5.3356,
|
|
"mean_token_accuracy": 0.18914903849363326,
|
|
"num_tokens": 12869967.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 5.433460998535156,
|
|
"epoch": 0.5393852065321806,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004977794593605386,
|
|
"loss": 5.3684,
|
|
"mean_token_accuracy": 0.18960850983858107,
|
|
"num_tokens": 12881230.0,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"entropy": 5.352547121047974,
|
|
"epoch": 0.5398655139289145,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000497774652090923,
|
|
"loss": 5.3222,
|
|
"mean_token_accuracy": 0.18944347649812698,
|
|
"num_tokens": 12892376.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 5.436691570281982,
|
|
"epoch": 0.5403458213256485,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004977698396491414,
|
|
"loss": 5.3307,
|
|
"mean_token_accuracy": 0.19240753799676896,
|
|
"num_tokens": 12903709.0,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"entropy": 5.2928542137146,
|
|
"epoch": 0.5408261287223823,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004977650220353055,
|
|
"loss": 5.1629,
|
|
"mean_token_accuracy": 0.19530351608991622,
|
|
"num_tokens": 12914958.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 5.280749416351318,
|
|
"epoch": 0.5413064361191162,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004977601992495274,
|
|
"loss": 5.2875,
|
|
"mean_token_accuracy": 0.1923414632678032,
|
|
"num_tokens": 12927418.0,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"entropy": 5.413435602188111,
|
|
"epoch": 0.5417867435158501,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004977553712919189,
|
|
"loss": 5.3325,
|
|
"mean_token_accuracy": 0.1892315372824669,
|
|
"num_tokens": 12939874.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 5.463119792938232,
|
|
"epoch": 0.5422670509125841,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004977505381625921,
|
|
"loss": 5.3542,
|
|
"mean_token_accuracy": 0.18793897628784179,
|
|
"num_tokens": 12951113.0,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"entropy": 5.333239316940308,
|
|
"epoch": 0.542747358309318,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004977456998616593,
|
|
"loss": 5.247,
|
|
"mean_token_accuracy": 0.19487171471118928,
|
|
"num_tokens": 12961940.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 5.247047281265258,
|
|
"epoch": 0.5432276657060519,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004977408563892327,
|
|
"loss": 5.2389,
|
|
"mean_token_accuracy": 0.19528348445892335,
|
|
"num_tokens": 12973938.0,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"entropy": 5.355054330825806,
|
|
"epoch": 0.5437079731027857,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004977360077454249,
|
|
"loss": 5.2669,
|
|
"mean_token_accuracy": 0.19261687248945236,
|
|
"num_tokens": 12985400.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 5.381504774093628,
|
|
"epoch": 0.5441882804995197,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004977311539303483,
|
|
"loss": 5.2984,
|
|
"mean_token_accuracy": 0.202898870408535,
|
|
"num_tokens": 12996402.0,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"entropy": 5.339759063720703,
|
|
"epoch": 0.5446685878962536,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004977262949441158,
|
|
"loss": 5.1882,
|
|
"mean_token_accuracy": 0.20247950553894042,
|
|
"num_tokens": 13006991.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 5.329454803466797,
|
|
"epoch": 0.5451488952929875,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004977214307868399,
|
|
"loss": 5.2909,
|
|
"mean_token_accuracy": 0.19646303355693817,
|
|
"num_tokens": 13016969.0,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"entropy": 5.333616399765015,
|
|
"epoch": 0.5456292026897214,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000497716561458634,
|
|
"loss": 5.2395,
|
|
"mean_token_accuracy": 0.1989587128162384,
|
|
"num_tokens": 13027759.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 5.4932708740234375,
|
|
"epoch": 0.5461095100864554,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004977116869596107,
|
|
"loss": 5.4415,
|
|
"mean_token_accuracy": 0.1860479310154915,
|
|
"num_tokens": 13039881.0,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"entropy": 5.399776601791382,
|
|
"epoch": 0.5465898174831892,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004977068072898834,
|
|
"loss": 5.3041,
|
|
"mean_token_accuracy": 0.18947898745536804,
|
|
"num_tokens": 13051443.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 5.3822290897369385,
|
|
"epoch": 0.5470701248799231,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004977019224495652,
|
|
"loss": 5.3697,
|
|
"mean_token_accuracy": 0.18962922990322112,
|
|
"num_tokens": 13063474.0,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"entropy": 5.307476902008057,
|
|
"epoch": 0.547550432276657,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004976970324387698,
|
|
"loss": 5.234,
|
|
"mean_token_accuracy": 0.20077043473720552,
|
|
"num_tokens": 13074365.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 5.339881372451782,
|
|
"epoch": 0.548030739673391,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004976921372576104,
|
|
"loss": 5.3033,
|
|
"mean_token_accuracy": 0.19367703795433044,
|
|
"num_tokens": 13087354.0,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"entropy": 5.32935528755188,
|
|
"epoch": 0.5485110470701249,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004976872369062011,
|
|
"loss": 5.2787,
|
|
"mean_token_accuracy": 0.19071510583162307,
|
|
"num_tokens": 13099306.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 5.4302033424377445,
|
|
"epoch": 0.5489913544668588,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004976823313846552,
|
|
"loss": 5.4164,
|
|
"mean_token_accuracy": 0.19036435931921006,
|
|
"num_tokens": 13111259.0,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"entropy": 5.4693896770477295,
|
|
"epoch": 0.5494716618635928,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004976774206930869,
|
|
"loss": 5.3256,
|
|
"mean_token_accuracy": 0.18587163984775543,
|
|
"num_tokens": 13123589.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 5.253912925720215,
|
|
"epoch": 0.5499519692603266,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004976725048316101,
|
|
"loss": 5.322,
|
|
"mean_token_accuracy": 0.19089159667491912,
|
|
"num_tokens": 13136485.0,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"entropy": 5.40102801322937,
|
|
"epoch": 0.5504322766570605,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004976675838003388,
|
|
"loss": 5.2997,
|
|
"mean_token_accuracy": 0.19145811647176741,
|
|
"num_tokens": 13148067.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 5.367999935150147,
|
|
"epoch": 0.5509125840537944,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004976626575993877,
|
|
"loss": 5.2818,
|
|
"mean_token_accuracy": 0.18961854726076127,
|
|
"num_tokens": 13159813.0,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"entropy": 5.410087442398071,
|
|
"epoch": 0.5513928914505284,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004976577262288705,
|
|
"loss": 5.356,
|
|
"mean_token_accuracy": 0.18928916603326798,
|
|
"num_tokens": 13170828.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 5.265670728683472,
|
|
"epoch": 0.5518731988472623,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004976527896889023,
|
|
"loss": 5.181,
|
|
"mean_token_accuracy": 0.20403801798820495,
|
|
"num_tokens": 13181883.0,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"entropy": 5.295314884185791,
|
|
"epoch": 0.5523535062439962,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004976478479795974,
|
|
"loss": 5.2557,
|
|
"mean_token_accuracy": 0.1949864685535431,
|
|
"num_tokens": 13193530.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 5.484155082702637,
|
|
"epoch": 0.55283381364073,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004976429011010706,
|
|
"loss": 5.4823,
|
|
"mean_token_accuracy": 0.17912757843732835,
|
|
"num_tokens": 13205822.0,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"entropy": 5.3539347648620605,
|
|
"epoch": 0.553314121037464,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004976379490534366,
|
|
"loss": 5.2081,
|
|
"mean_token_accuracy": 0.19992550164461137,
|
|
"num_tokens": 13216698.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 5.291062736511231,
|
|
"epoch": 0.5537944284341979,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004976329918368107,
|
|
"loss": 5.2968,
|
|
"mean_token_accuracy": 0.19075367897748946,
|
|
"num_tokens": 13228389.0,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"entropy": 5.433424997329712,
|
|
"epoch": 0.5542747358309318,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004976280294513079,
|
|
"loss": 5.3505,
|
|
"mean_token_accuracy": 0.18287664502859116,
|
|
"num_tokens": 13239628.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 5.404953861236573,
|
|
"epoch": 0.5547550432276657,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004976230618970431,
|
|
"loss": 5.352,
|
|
"mean_token_accuracy": 0.19548004865646362,
|
|
"num_tokens": 13251149.0,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"entropy": 5.455016326904297,
|
|
"epoch": 0.5552353506243997,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000497618089174132,
|
|
"loss": 5.413,
|
|
"mean_token_accuracy": 0.18660195618867875,
|
|
"num_tokens": 13264846.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 5.248121690750122,
|
|
"epoch": 0.5557156580211335,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004976131112826898,
|
|
"loss": 5.1913,
|
|
"mean_token_accuracy": 0.2054605171084404,
|
|
"num_tokens": 13275409.0,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"entropy": 5.259016036987305,
|
|
"epoch": 0.5561959654178674,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004976081282228323,
|
|
"loss": 5.1657,
|
|
"mean_token_accuracy": 0.20358884781599046,
|
|
"num_tokens": 13287173.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 5.411679124832153,
|
|
"epoch": 0.5566762728146013,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000497603139994675,
|
|
"loss": 5.2377,
|
|
"mean_token_accuracy": 0.19680293649435043,
|
|
"num_tokens": 13298225.0,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"entropy": 5.2930761814117435,
|
|
"epoch": 0.5571565802113353,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004975981465983338,
|
|
"loss": 5.2468,
|
|
"mean_token_accuracy": 0.19053254425525665,
|
|
"num_tokens": 13309685.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 5.304633331298828,
|
|
"epoch": 0.5576368876080692,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004975931480339246,
|
|
"loss": 5.2554,
|
|
"mean_token_accuracy": 0.19651708900928497,
|
|
"num_tokens": 13320837.0,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"entropy": 5.383905267715454,
|
|
"epoch": 0.5581171950048031,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004975881443015635,
|
|
"loss": 5.3718,
|
|
"mean_token_accuracy": 0.19027461260557174,
|
|
"num_tokens": 13333512.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 5.465289068222046,
|
|
"epoch": 0.5585975024015369,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004975831354013667,
|
|
"loss": 5.3829,
|
|
"mean_token_accuracy": 0.19368760734796525,
|
|
"num_tokens": 13345189.0,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"entropy": 5.329316329956055,
|
|
"epoch": 0.5590778097982709,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004975781213334503,
|
|
"loss": 5.2472,
|
|
"mean_token_accuracy": 0.20152513086795806,
|
|
"num_tokens": 13356123.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 5.329442405700684,
|
|
"epoch": 0.5595581171950048,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004975731020979309,
|
|
"loss": 5.2949,
|
|
"mean_token_accuracy": 0.19351785629987717,
|
|
"num_tokens": 13366902.0,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"entropy": 5.4559613227844235,
|
|
"epoch": 0.5600384245917387,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004975680776949249,
|
|
"loss": 5.3542,
|
|
"mean_token_accuracy": 0.18989898711442948,
|
|
"num_tokens": 13377567.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 5.390386629104614,
|
|
"epoch": 0.5605187319884726,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004975630481245492,
|
|
"loss": 5.2869,
|
|
"mean_token_accuracy": 0.2009364992380142,
|
|
"num_tokens": 13387297.0,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"entropy": 5.348505544662475,
|
|
"epoch": 0.5609990393852066,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004975580133869202,
|
|
"loss": 5.3381,
|
|
"mean_token_accuracy": 0.1932346299290657,
|
|
"num_tokens": 13397723.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 5.408625984191895,
|
|
"epoch": 0.5614793467819404,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004975529734821552,
|
|
"loss": 5.3863,
|
|
"mean_token_accuracy": 0.18635910749435425,
|
|
"num_tokens": 13409875.0,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"entropy": 5.352054500579834,
|
|
"epoch": 0.5619596541786743,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004975479284103708,
|
|
"loss": 5.2921,
|
|
"mean_token_accuracy": 0.1954024314880371,
|
|
"num_tokens": 13421338.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 5.418287992477417,
|
|
"epoch": 0.5624399615754082,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004975428781716845,
|
|
"loss": 5.3258,
|
|
"mean_token_accuracy": 0.19152757823467254,
|
|
"num_tokens": 13431373.0,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"entropy": 5.360725784301758,
|
|
"epoch": 0.5629202689721422,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004975378227662134,
|
|
"loss": 5.3208,
|
|
"mean_token_accuracy": 0.19721843004226686,
|
|
"num_tokens": 13443158.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 5.44525113105774,
|
|
"epoch": 0.5634005763688761,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004975327621940746,
|
|
"loss": 5.3795,
|
|
"mean_token_accuracy": 0.18757863938808442,
|
|
"num_tokens": 13454559.0,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"entropy": 5.453475904464722,
|
|
"epoch": 0.56388088376561,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004975276964553861,
|
|
"loss": 5.4604,
|
|
"mean_token_accuracy": 0.1895272508263588,
|
|
"num_tokens": 13466934.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 5.349884796142578,
|
|
"epoch": 0.5643611911623438,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004975226255502651,
|
|
"loss": 5.2124,
|
|
"mean_token_accuracy": 0.20376883447170258,
|
|
"num_tokens": 13477770.0,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"entropy": 5.428862237930298,
|
|
"epoch": 0.5648414985590778,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004975175494788297,
|
|
"loss": 5.4214,
|
|
"mean_token_accuracy": 0.1833633303642273,
|
|
"num_tokens": 13490093.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 5.4273130893707275,
|
|
"epoch": 0.5653218059558117,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004975124682411974,
|
|
"loss": 5.2743,
|
|
"mean_token_accuracy": 0.19006698280572892,
|
|
"num_tokens": 13500663.0,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"entropy": 5.404650068283081,
|
|
"epoch": 0.5658021133525456,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004975073818374863,
|
|
"loss": 5.3747,
|
|
"mean_token_accuracy": 0.19194794446229935,
|
|
"num_tokens": 13512369.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 5.352162408828735,
|
|
"epoch": 0.5662824207492796,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004975022902678145,
|
|
"loss": 5.2518,
|
|
"mean_token_accuracy": 0.18981288820505143,
|
|
"num_tokens": 13523181.0,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"entropy": 5.307896852493286,
|
|
"epoch": 0.5667627281460135,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004974971935323003,
|
|
"loss": 5.2062,
|
|
"mean_token_accuracy": 0.19488532990217208,
|
|
"num_tokens": 13534113.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 5.3025891304016115,
|
|
"epoch": 0.5672430355427474,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004974920916310619,
|
|
"loss": 5.2425,
|
|
"mean_token_accuracy": 0.19460777193307877,
|
|
"num_tokens": 13545037.0,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"entropy": 5.368872261047363,
|
|
"epoch": 0.5677233429394812,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004974869845642178,
|
|
"loss": 5.2926,
|
|
"mean_token_accuracy": 0.19421349167823793,
|
|
"num_tokens": 13555541.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 5.389457654953003,
|
|
"epoch": 0.5682036503362152,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004974818723318866,
|
|
"loss": 5.2973,
|
|
"mean_token_accuracy": 0.19764145314693451,
|
|
"num_tokens": 13566951.0,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"entropy": 5.347638368606567,
|
|
"epoch": 0.5686839577329491,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004974767549341868,
|
|
"loss": 5.3505,
|
|
"mean_token_accuracy": 0.18888978958129882,
|
|
"num_tokens": 13578492.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 5.425949621200561,
|
|
"epoch": 0.569164265129683,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004974716323712376,
|
|
"loss": 5.2433,
|
|
"mean_token_accuracy": 0.20290264040231704,
|
|
"num_tokens": 13589183.0,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"entropy": 5.37887659072876,
|
|
"epoch": 0.5696445725264169,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004974665046431576,
|
|
"loss": 5.3868,
|
|
"mean_token_accuracy": 0.19258931577205657,
|
|
"num_tokens": 13600588.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 5.309185123443603,
|
|
"epoch": 0.5701248799231509,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004974613717500659,
|
|
"loss": 5.2605,
|
|
"mean_token_accuracy": 0.20295644104480742,
|
|
"num_tokens": 13612107.0,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"entropy": 5.485657453536987,
|
|
"epoch": 0.5706051873198847,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004974562336920818,
|
|
"loss": 5.4246,
|
|
"mean_token_accuracy": 0.18908909112215042,
|
|
"num_tokens": 13623973.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 5.3633698463439945,
|
|
"epoch": 0.5710854947166186,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004974510904693245,
|
|
"loss": 5.2372,
|
|
"mean_token_accuracy": 0.19648284167051316,
|
|
"num_tokens": 13634994.0,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"entropy": 5.412157249450684,
|
|
"epoch": 0.5715658021133525,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004974459420819134,
|
|
"loss": 5.3895,
|
|
"mean_token_accuracy": 0.19440043568611146,
|
|
"num_tokens": 13646361.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 5.36341814994812,
|
|
"epoch": 0.5720461095100865,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000497440788529968,
|
|
"loss": 5.2834,
|
|
"mean_token_accuracy": 0.19329349249601363,
|
|
"num_tokens": 13656975.0,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"entropy": 5.428890562057495,
|
|
"epoch": 0.5725264169068204,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004974356298136081,
|
|
"loss": 5.3207,
|
|
"mean_token_accuracy": 0.18961571753025055,
|
|
"num_tokens": 13668434.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 5.403112125396729,
|
|
"epoch": 0.5730067243035543,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004974304659329533,
|
|
"loss": 5.301,
|
|
"mean_token_accuracy": 0.1921529397368431,
|
|
"num_tokens": 13679266.0,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"entropy": 5.291449975967407,
|
|
"epoch": 0.5734870317002881,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004974252968881236,
|
|
"loss": 5.3247,
|
|
"mean_token_accuracy": 0.18704658299684523,
|
|
"num_tokens": 13690921.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 5.385117483139038,
|
|
"epoch": 0.5739673390970221,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000497420122679239,
|
|
"loss": 5.2579,
|
|
"mean_token_accuracy": 0.19390686601400375,
|
|
"num_tokens": 13702329.0,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"entropy": 5.317170143127441,
|
|
"epoch": 0.574447646493756,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004974149433064196,
|
|
"loss": 5.2295,
|
|
"mean_token_accuracy": 0.20150385797023773,
|
|
"num_tokens": 13713356.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 5.237676763534546,
|
|
"epoch": 0.5749279538904899,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004974097587697856,
|
|
"loss": 5.2294,
|
|
"mean_token_accuracy": 0.19473931789398194,
|
|
"num_tokens": 13724718.0,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"entropy": 5.28824028968811,
|
|
"epoch": 0.5754082612872238,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004974045690694575,
|
|
"loss": 5.2596,
|
|
"mean_token_accuracy": 0.196784345805645,
|
|
"num_tokens": 13736113.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 5.417406034469605,
|
|
"epoch": 0.5758885686839578,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004973993742055557,
|
|
"loss": 5.272,
|
|
"mean_token_accuracy": 0.19672393202781677,
|
|
"num_tokens": 13748322.0,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"entropy": 5.3009929180145265,
|
|
"epoch": 0.5763688760806917,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004973941741782007,
|
|
"loss": 5.2743,
|
|
"mean_token_accuracy": 0.18973211497068404,
|
|
"num_tokens": 13759433.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.5763688760806917,
|
|
"eval_entropy": 5.216975544093005,
|
|
"eval_loss": 5.320178508758545,
|
|
"eval_mean_token_accuracy": 0.1993778554485636,
|
|
"eval_num_tokens": 13759433.0,
|
|
"eval_runtime": 27.3927,
|
|
"eval_samples_per_second": 1197.949,
|
|
"eval_steps_per_second": 149.748,
|
|
"step": 6000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 104090,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 3000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.1108419647488e+16,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|